TFM: SUPERMARKET SALES 2019

1. Carga de librerías

# Package loading.
# NOTE(review): attach order matters for function masking — plyr must be
# attached before dplyr (the reverse breaks dplyr's grouped verbs), and
# packages attached later (e.g. MASS) mask some dplyr functions such as
# select(); verify masking warnings when re-running.
library(plyr)
library(dplyr)
library(caret)
library(class)
library(cluster)
library(clValid)
library(cvms)
library(dataPreparation)
library(data.table)
library(DMwR)
library(extrafont)
# Register system fonts for plotting; device = "win" is Windows-only.
loadfonts(device = "win")
library(e1071)
library(fastDummies)
library(factoextra)
library(FactoMineR)
library(fpc)
library(gbm)
library(ggcorrplot)
library(ggimage)
library(ggplot2)
library(ggradar)
library(gmodels)
library(gridExtra)
library(Hmisc)
library(hrbrthemes)
library(inspectdf) 
library(knitr)
library(kableExtra)
library(lubridate)
library(MASS)
library(MLmetrics)
library(modeest)
library(NbClust)
library(purrr)
library(randomForest)
library(ranger)
library(readr)
library(ROCR)
library(rpart)
library(rsvg)
library(skimr)
library(tidyverse)
library(TraMineR)
library(vegan)
library(viridis)
library(xgboost)
library(tidyr)

2. Carga del conjunto de datos

# URL of the raw supermarket sales CSV hosted on GitHub.
rutacsv <- "https://raw.githubusercontent.com/Juanmick/TFM/master/supermarket_sales%20-%20Sheet1.csv"

# Read the dataset over HTTP into a data.frame (1000 rows x 17 columns).
supermarket <- read.csv(rutacsv)

3. Análisis básico del conjunto de datos

# First six rows of the dataset.
head(supermarket)
##    Invoice.ID Branch      City Customer.type Gender           Product.line
## 1 750-67-8428      A    Yangon        Member Female      Health and beauty
## 2 226-31-3081      C Naypyitaw        Normal Female Electronic accessories
## 3 631-41-3108      A    Yangon        Normal   Male     Home and lifestyle
## 4 123-19-1176      A    Yangon        Member   Male      Health and beauty
## 5 373-73-7910      A    Yangon        Normal   Male      Sports and travel
## 6 699-14-3026      C Naypyitaw        Normal   Male Electronic accessories
##   Unit.price Quantity  Tax.5.    Total      Date  Time     Payment   cogs
## 1      74.69        7 26.1415 548.9715  1/5/2019 13:08     Ewallet 522.83
## 2      15.28        5  3.8200  80.2200  3/8/2019 10:29        Cash  76.40
## 3      46.33        7 16.2155 340.5255  3/3/2019 13:23 Credit card 324.31
## 4      58.22        8 23.2880 489.0480 1/27/2019 20:33     Ewallet 465.76
## 5      86.31        7 30.2085 634.3785  2/8/2019 10:37     Ewallet 604.17
## 6      85.39        7 29.8865 627.6165 3/25/2019 18:30     Ewallet 597.73
##   gross.margin.percentage gross.income Rating
## 1                4.761905      26.1415    9.1
## 2                4.761905       3.8200    9.6
## 3                4.761905      16.2155    7.4
## 4                4.761905      23.2880    8.4
## 5                4.761905      30.2085    5.3
## 6                4.761905      29.8865    4.1
# Structure: 1000 observations of 17 variables.
str(supermarket)
## 'data.frame':    1000 obs. of  17 variables:
##  $ Invoice.ID             : chr  "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
##  $ Branch                 : chr  "A" "C" "A" "A" ...
##  $ City                   : chr  "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
##  $ Customer.type          : chr  "Member" "Normal" "Normal" "Member" ...
##  $ Gender                 : chr  "Female" "Female" "Male" "Male" ...
##  $ Product.line           : chr  "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
##  $ Unit.price             : num  74.7 15.3 46.3 58.2 86.3 ...
##  $ Quantity               : int  7 5 7 8 7 7 6 10 2 3 ...
##  $ Tax.5.                 : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Total                  : num  549 80.2 340.5 489 634.4 ...
##  $ Date                   : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Time                   : chr  "13:08" "10:29" "13:23" "20:33" ...
##  $ Payment                : chr  "Ewallet" "Cash" "Credit card" "Ewallet" ...
##  $ cogs                   : num  522.8 76.4 324.3 465.8 604.2 ...
##  $ gross.margin.percentage: num  4.76 4.76 4.76 4.76 4.76 ...
##  $ gross.income           : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Rating                 : num  9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
# dplyr's compact transposed preview of the columns and their types.
glimpse(supermarket)
## Rows: 1,000
## Columns: 17
## $ Invoice.ID              <chr> "750-67-8428", "226-31-3081", "631-41-3108"...
## $ Branch                  <chr> "A", "C", "A", "A", "A", "C", "A", "C", "A"...
## $ City                    <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", ...
## $ Customer.type           <chr> "Member", "Normal", "Normal", "Member", "No...
## $ Gender                  <chr> "Female", "Female", "Male", "Male", "Male",...
## $ Product.line            <chr> "Health and beauty", "Electronic accessorie...
## $ Unit.price              <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39, 6...
## $ Quantity                <int> 7, 5, 7, 8, 7, 7, 6, 10, 2, 3, 4, 4, 5, 10,...
## $ Tax.5.                  <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085,...
## $ Total                   <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634....
## $ Date                    <chr> "1/5/2019", "3/8/2019", "3/3/2019", "1/27/2...
## $ Time                    <chr> "13:08", "10:29", "13:23", "20:33", "10:37"...
## $ Payment                 <chr> "Ewallet", "Cash", "Credit card", "Ewallet"...
## $ cogs                    <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597....
## $ gross.margin.percentage <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.7...
## $ gross.income            <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085,...
## $ Rating                  <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1, 5.8, 8.0, 7.2...
# Hmisc::describe — per-variable summary with frequencies and quantiles.
describe(supermarket) 
## supermarket 
## 
##  17  Variables      1000  Observations
## --------------------------------------------------------------------------------
## Invoice.ID 
##        n  missing distinct 
##     1000        0     1000 
## 
## lowest : 101-17-6199 101-81-4070 102-06-2002 102-77-2261 105-10-6182
## highest: 894-41-5205 895-03-6665 895-66-0685 896-34-0956 898-04-2717
## --------------------------------------------------------------------------------
## Branch 
##        n  missing distinct 
##     1000        0        3 
##                             
## Value          A     B     C
## Frequency    340   332   328
## Proportion 0.340 0.332 0.328
## --------------------------------------------------------------------------------
## City 
##        n  missing distinct 
##     1000        0        3 
##                                         
## Value       Mandalay Naypyitaw    Yangon
## Frequency        332       328       340
## Proportion     0.332     0.328     0.340
## --------------------------------------------------------------------------------
## Customer.type 
##        n  missing distinct 
##     1000        0        2 
##                         
## Value      Member Normal
## Frequency     501    499
## Proportion  0.501  0.499
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##     1000        0        2 
##                         
## Value      Female   Male
## Frequency     501    499
## Proportion  0.501  0.499
## --------------------------------------------------------------------------------
## Product.line 
##        n  missing distinct 
##     1000        0        6 
## 
## lowest : Electronic accessories Fashion accessories    Food and beverages     Health and beauty      Home and lifestyle    
## highest: Fashion accessories    Food and beverages     Health and beauty      Home and lifestyle     Sports and travel     
##                                                                                
## Value      Electronic accessories    Fashion accessories     Food and beverages
## Frequency                     170                    178                    174
## Proportion                  0.170                  0.178                  0.174
##                                                                                
## Value           Health and beauty     Home and lifestyle      Sports and travel
## Frequency                     152                    160                    166
## Proportion                  0.152                  0.160                  0.166
## --------------------------------------------------------------------------------
## Unit.price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0      943        1    55.67     30.6    15.28    19.31 
##      .25      .50      .75      .90      .95 
##    32.88    55.23    77.94    93.12    97.22 
## 
## lowest : 10.08 10.13 10.16 10.17 10.18, highest: 99.82 99.83 99.89 99.92 99.96
## --------------------------------------------------------------------------------
## Quantity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0       10     0.99     5.51     3.36        1        1 
##      .25      .50      .75      .90      .95 
##        3        5        8       10       10 
## 
## lowest :  1  2  3  4  5, highest:  6  7  8  9 10
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency    112    91    90   109   102    98   102    85    92   119
## Proportion 0.112 0.091 0.090 0.109 0.102 0.098 0.102 0.085 0.092 0.119
## --------------------------------------------------------------------------------
## Tax.5. 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0      990        1    15.38    12.89    1.956    3.243 
##      .25      .50      .75      .90      .95 
##    5.925   12.088   22.445   34.234   39.166 
## 
## lowest :  0.5085  0.6045  0.6270  0.6390  0.6990
## highest: 48.6900 48.7500 49.2600 49.4900 49.6500
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0      990        1      323    270.7    41.07    68.10 
##      .25      .50      .75      .90      .95 
##   124.42   253.85   471.35   718.91   822.50 
## 
## lowest :   10.6785   12.6945   13.1670   13.4190   14.6790
## highest: 1022.4900 1023.7500 1034.4600 1039.2900 1042.6500
## --------------------------------------------------------------------------------
## Date 
##        n  missing distinct 
##     1000        0       89 
## 
## lowest : 1/1/2019  1/10/2019 1/11/2019 1/12/2019 1/13/2019
## highest: 3/5/2019  3/6/2019  3/7/2019  3/8/2019  3/9/2019 
## --------------------------------------------------------------------------------
## Time 
##        n  missing distinct 
##     1000        0      506 
## 
## lowest : 10:00 10:01 10:02 10:03 10:04, highest: 20:52 20:54 20:55 20:57 20:59
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct 
##     1000        0        3 
##                                               
## Value             Cash Credit card     Ewallet
## Frequency          344         311         345
## Proportion       0.344       0.311       0.345
## --------------------------------------------------------------------------------
## cogs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0      990        1    307.6    257.8    39.11    64.86 
##      .25      .50      .75      .90      .95 
##   118.50   241.76   448.91   684.68   783.33 
## 
## lowest :  10.17  12.09  12.54  12.78  13.98, highest: 973.80 975.00 985.20 989.80 993.00
## --------------------------------------------------------------------------------
## gross.margin.percentage 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        1        0    4.762        0 
##                    
## Value      4.761905
## Frequency      1000
## Proportion        1
## --------------------------------------------------------------------------------
## gross.income 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0      990        1    15.38    12.89    1.956    3.243 
##      .25      .50      .75      .90      .95 
##    5.925   12.088   22.445   34.234   39.166 
## 
## lowest :  0.5085  0.6045  0.6270  0.6390  0.6990
## highest: 48.6900 48.7500 49.2600 49.4900 49.6500
## --------------------------------------------------------------------------------
## Rating 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0       61        1    6.973    1.985    4.295    4.500 
##      .25      .50      .75      .90      .95 
##    5.500    7.000    8.500    9.400    9.700 
## 
## lowest :  4.0  4.1  4.2  4.3  4.4, highest:  9.6  9.7  9.8  9.9 10.0
## --------------------------------------------------------------------------------
# Median unit price 55.23
# Median quantity 5
# Median rating 7
summary(supermarket)
##   Invoice.ID           Branch              City           Customer.type     
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     Gender          Product.line         Unit.price       Quantity    
##  Length:1000        Length:1000        Min.   :10.08   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.:32.88   1st Qu.: 3.00  
##  Mode  :character   Mode  :character   Median :55.23   Median : 5.00  
##                                        Mean   :55.67   Mean   : 5.51  
##                                        3rd Qu.:77.94   3rd Qu.: 8.00  
##                                        Max.   :99.96   Max.   :10.00  
##      Tax.5.            Total             Date               Time          
##  Min.   : 0.5085   Min.   :  10.68   Length:1000        Length:1000       
##  1st Qu.: 5.9249   1st Qu.: 124.42   Class :character   Class :character  
##  Median :12.0880   Median : 253.85   Mode  :character   Mode  :character  
##  Mean   :15.3794   Mean   : 322.97                                        
##  3rd Qu.:22.4453   3rd Qu.: 471.35                                        
##  Max.   :49.6500   Max.   :1042.65                                        
##    Payment               cogs        gross.margin.percentage  gross.income    
##  Length:1000        Min.   : 10.17   Min.   :4.762           Min.   : 0.5085  
##  Class :character   1st Qu.:118.50   1st Qu.:4.762           1st Qu.: 5.9249  
##  Mode  :character   Median :241.76   Median :4.762           Median :12.0880  
##                     Mean   :307.59   Mean   :4.762           Mean   :15.3794  
##                     3rd Qu.:448.90   3rd Qu.:4.762           3rd Qu.:22.4453  
##                     Max.   :993.00   Max.   :4.762           Max.   :49.6500  
##      Rating      
##  Min.   : 4.000  
##  1st Qu.: 5.500  
##  Median : 7.000  
##  Mean   : 6.973  
##  3rd Qu.: 8.500  
##  Max.   :10.000
# Count missing values per column; the dataset contains no NA's.
# vapply() replaces sapply(): it guarantees the return type (a named
# integer vector) regardless of input, whereas sapply() may silently
# change shape on unusual inputs.
vapply(supermarket, function(x) sum(is.na(x)), integer(1))
##              Invoice.ID                  Branch                    City 
##                       0                       0                       0 
##           Customer.type                  Gender            Product.line 
##                       0                       0                       0 
##              Unit.price                Quantity                  Tax.5. 
##                       0                       0                       0 
##                   Total                    Date                    Time 
##                       0                       0                       0 
##                 Payment                    cogs gross.margin.percentage 
##                       0                       0                       0 
##            gross.income                  Rating 
##                       0                       0

4. Explorando el conjunto de datos

# Exploratory data analysis (EDA) overview with inspectdf, one summary
# plot per aspect of the dataset. Each pipeline computes the summary and
# renders its plot directly.

# Categorical variable levels
supermarket %>% inspect_cat() %>% show_plot()

# Correlations between numeric columns
supermarket %>% inspect_cor() %>% show_plot()

# Most common level per categorical variable (feature imbalance)
supermarket %>% inspect_imb() %>% show_plot()

# Memory usage per column
supermarket %>% inspect_mem() %>% show_plot()

# Missing values per column
supermarket %>% inspect_na() %>% show_plot()

# Histograms of numeric columns
supermarket %>% inspect_num() %>% show_plot()

# Breakdown of column types
supermarket %>% inspect_types() %>% show_plot()

5. Tratamiento del conjunto de datos

# Parse the Date string ("m/d/Y") into a Date column used as a merge key.
supermarket$Date1 <- as.Date(supermarket$Date, "%m/%d/%Y")

# URL of an RDS file with daily temperature data per city.
temp <- "https://github.com/Juanmick/TFM/blob/master/temp.rds?raw=true"

temperaturas <- readRDS(url(temp))

# OUTER JOIN to append the temperature columns. all = TRUE keeps
# date/city combinations with no sales, which introduces NA rows that
# are removed later with na.omit().
supermarket <- merge(x = supermarket, y = temperaturas, by = c("Date1","City" ), all = TRUE)

# Combine the Date and Time columns into a single "datetime" string.
# FIX: columns are referenced by name instead of by position (c(12:13)),
# which silently depended on the column order produced by the preceding
# merge(); the named form is robust to reordering.
supermarket <- unite(supermarket, datetime, Date, Time, sep = " ")

# Parse "m/d/Y H:M" strings into POSIXct.
supermarket$datetime <- as.POSIXct(supermarket$datetime, format = "%m/%d/%Y %H:%M")

# Day of the week as a number (lubridate default: 1 = Sunday ... 7 = Saturday).
supermarket$day <- wday(supermarket$datetime)

# Month number (1-3 in this dataset).
supermarket$month <- month(supermarket$datetime)

# Week of the year.
supermarket$week <- week(supermarket$datetime)

# Hour of purchase.
supermarket$hour <- hour(supermarket$datetime)

# Day of the month.
supermarket$daynum <- day(supermarket$datetime)

# Shorten the Product.line category labels using a named lookup vector
# instead of six separate replacement statements.
line_labels <- c(
  "Health and beauty"      = "Health&Beauty",
  "Electronic accessories" = "Electronic",
  "Home and lifestyle"     = "Home&Lifestyle",
  "Sports and travel"      = "Sports&Travel",
  "Food and beverages"     = "Food&Beverages",
  "Fashion accessories"    = "Fashion_accessories"
)
matched <- supermarket$Product.line %in% names(line_labels)
supermarket$Product.line[matched] <- line_labels[supermarket$Product.line[matched]]

# Drop the helper merge key.
supermarket$Date1 <- NULL

# Drop Branch: redundant with City (one branch per city; the frequency
# tables above show identical counts, e.g. A = Yangon = 340).
supermarket$Branch <- NULL

# Drop gross.margin.percentage: constant (4.761905) for every row.
supermarket$gross.margin.percentage <- NULL

# Drop gross.income: identical to Tax.5. for every row.
supermarket$gross.income <- NULL

# Drop Invoice.ID: a unique identifier with no analytical value.
supermarket$Invoice.ID <- NULL

# Remove the NA rows introduced by the outer join with the temperature data.
supermarket <- na.omit(supermarket)

# Persist the cleaned dataset.
saveRDS(supermarket, file = "supermarket.rds")

6. Análisis descriptivo

6.1 Datos de análisis y tablas

# ANALYSIS OF MEMBER CUSTOMERS

# Subset: the 501 transactions made by customers of type "Member".
miembros <- filter(supermarket, Customer.type == "Member")

summary(miembros)
##      City           Customer.type         Gender          Product.line      
##  Length:501         Length:501         Length:501         Length:501        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Unit.price       Quantity          Tax.5.            Total        
##  Min.   :10.08   Min.   : 1.000   Min.   : 0.5085   Min.   :  10.68  
##  1st Qu.:32.25   1st Qu.: 3.000   1st Qu.: 5.6310   1st Qu.: 118.25  
##  Median :56.04   Median : 5.000   Median :12.6680   Median : 266.03  
##  Mean   :56.21   Mean   : 5.559   Mean   :15.6091   Mean   : 327.79  
##  3rd Qu.:79.93   3rd Qu.: 8.000   3rd Qu.:23.1225   3rd Qu.: 485.57  
##  Max.   :99.96   Max.   :10.000   Max.   :49.6500   Max.   :1042.65  
##     datetime                     Payment               cogs       
##  Min.   :2019-01-01 11:36:00   Length:501         Min.   : 10.17  
##  1st Qu.:2019-01-24 17:37:00   Class :character   1st Qu.:112.62  
##  Median :2019-02-12 17:49:00   Mode  :character   Median :253.36  
##  Mean   :2019-02-14 05:45:02                      Mean   :312.18  
##  3rd Qu.:2019-03-06 15:31:00                      3rd Qu.:462.45  
##  Max.   :2019-03-30 20:37:00                      Max.   :993.00  
##      Rating           tmed            day            month      
##  Min.   : 4.00   Min.   :19.80   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 5.40   1st Qu.:25.00   1st Qu.:3.000   1st Qu.:1.000  
##  Median : 7.00   Median :27.00   Median :4.000   Median :2.000  
##  Mean   : 6.94   Mean   :26.67   Mean   :4.064   Mean   :1.988  
##  3rd Qu.: 8.50   3rd Qu.:28.30   3rd Qu.:6.000   3rd Qu.:3.000  
##  Max.   :10.00   Max.   :34.00   Max.   :7.000   Max.   :3.000  
##       week             hour           daynum     
##  Min.   : 1.000   Min.   :10.00   Min.   : 1.00  
##  1st Qu.: 4.000   1st Qu.:12.00   1st Qu.: 7.00  
##  Median : 7.000   Median :15.00   Median :15.00  
##  Mean   : 6.944   Mean   :14.97   Mean   :14.96  
##  3rd Qu.:10.000   3rd Qu.:18.00   3rd Qu.:23.00  
##  Max.   :13.000   Max.   :20.00   Max.   :31.00
#mediana precio unidad  56.04
#mediana cantidad 5
#mediana rating 7
#mediana total 266

# Total revenue from member customers.
# FIX: `<-` replaces `=` for top-level assignment (tidyverse style).
IngresosMiembros <- sum(miembros$Total)
IngresosMiembros #164223
## [1] 164223.4
# Total units bought by member customers.
CantidadesMiembros <- sum(miembros$Quantity)
CantidadesMiembros #2785
## [1] 2785
# Most frequent gender among members (mode via modeest::mlv).
ModaMiembroGenero <- mlv(miembros$Gender, method = "mfv")
ModaMiembroGenero #mujeres
## [1] "Female"
kable(table(miembros$Gender),caption = "Frecuencia Genero(Miembro)") 
Frecuencia Genero(Miembro)
Var1 Freq
Female 261
Male 240
#261 mujeres
# Most frequent product line among members (mode).
# FIX: `<-` replaces `=` for top-level assignment (tidyverse style).
ModaMiembroProductos <- mlv(miembros$Product.line, method = "mfv")
ModaMiembroProductos #Food&Beverages 94
## [1] "Food&Beverages"
kable(table(miembros$Product.line),caption = "Frecuencia Productos(Miembro)")
Frecuencia Productos(Miembro)
Var1 Freq
Electronic 78
Fashion_accessories 86
Food&Beverages 94
Health&Beauty 73
Home&Lifestyle 83
Sports&Travel 87
kable(table(miembros$Payment),caption = "Frecuencia Pago(Miembro)") 
Frecuencia Pago(Miembro)
Var1 Freq
Cash 168
Credit card 172
Ewallet 161
#172 Credit card
kable(table(miembros$day),caption = "Frecuencia dias(Miembro)") 
Frecuencia dias(Miembro)
Var1 Freq
1 66
2 59
3 90
4 76
5 64
6 64
7 82
#90 martes
kable(table(hour(miembros$datetime)),caption = "Frecuencia Horas(Miembro)") 
Frecuencia Horas(Miembro)
Var1 Freq
10 42
11 46
12 46
13 49
14 48
15 56
16 37
17 36
18 45
19 61
20 35
# 19:00 61compras 
# Most frequent rating among members (mode; bimodal here).
# FIX: `<-` replaces `=` for top-level assignment (tidyverse style).
ModaMiembroRating <- mlv(miembros$Rating, method = "mfv")
ModaMiembroRating #6.6 (14)y 9.5 (14)
## [1] 6.6 9.5
kable(table(miembros$Rating),caption = "Frecuencia Rating(Miembro)")
Frecuencia Rating(Miembro)
Var1 Freq
4 7
4.1 9
4.2 11
4.3 11
4.4 7
4.5 10
4.6 6
4.7 8
4.8 6
4.9 10
5 13
5.1 8
5.2 7
5.3 6
5.4 8
5.5 8
5.6 11
5.7 6
5.8 7
5.9 9
6 12
6.1 7
6.2 6
6.3 5
6.4 6
6.5 10
6.6 14
6.7 9
6.8 6
6.9 7
7 13
7.1 7
7.2 6
7.3 12
7.4 5
7.5 8
7.6 9
7.7 8
7.8 8
7.9 7
8 11
8.1 8
8.2 5
8.3 5
8.4 11
8.5 12
8.6 9
8.7 10
8.8 8
8.9 9
9 6
9.1 8
9.2 6
9.3 5
9.4 6
9.5 14
9.6 7
9.7 8
9.8 11
9.9 7
10 2
# ANALYSIS OF NORMAL (NON-MEMBER) CUSTOMERS

# Subset: the 499 transactions made by customers of type "Normal".
normal <- filter(supermarket, Customer.type == "Normal")
summary(normal)
##      City           Customer.type         Gender          Product.line      
##  Length:499         Length:499         Length:499         Length:499        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Unit.price       Quantity          Tax.5.            Total        
##  Min.   :10.56   Min.   : 1.000   Min.   : 0.6045   Min.   :  12.69  
##  1st Qu.:33.23   1st Qu.: 3.000   1st Qu.: 6.1540   1st Qu.: 129.23  
##  Median :54.28   Median : 5.000   Median :11.3060   Median : 237.43  
##  Mean   :55.14   Mean   : 5.461   Mean   :15.1487   Mean   : 318.12  
##  3rd Qu.:76.46   3rd Qu.: 8.000   3rd Qu.:22.0290   3rd Qu.: 462.61  
##  Max.   :99.96   Max.   :10.000   Max.   :49.4900   Max.   :1039.29  
##     datetime                     Payment               cogs       
##  Min.   :2019-01-01 10:39:00   Length:499         Min.   : 12.09  
##  1st Qu.:2019-01-24 19:23:00   Class :character   1st Qu.:123.08  
##  Median :2019-02-15 12:44:00   Mode  :character   Median :226.12  
##  Mean   :2019-02-15 01:18:12                      Mean   :302.97  
##  3rd Qu.:2019-03-08 20:19:30                      3rd Qu.:440.58  
##  Max.   :2019-03-30 14:58:00                      Max.   :989.80  
##      Rating            tmed            day            month      
##  Min.   : 4.000   Min.   :19.80   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 5.700   1st Qu.:25.00   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 7.000   Median :27.00   Median :4.000   Median :2.000  
##  Mean   : 7.005   Mean   :26.72   Mean   :4.138   Mean   :1.998  
##  3rd Qu.: 8.400   3rd Qu.:28.40   3rd Qu.:6.000   3rd Qu.:3.000  
##  Max.   :10.000   Max.   :34.00   Max.   :7.000   Max.   :3.000  
##       week             hour           daynum     
##  Min.   : 1.000   Min.   :10.00   Min.   : 1.00  
##  1st Qu.: 4.000   1st Qu.:12.00   1st Qu.: 8.00  
##  Median : 7.000   Median :15.00   Median :15.00  
##  Mean   : 7.048   Mean   :14.85   Mean   :15.55  
##  3rd Qu.:10.000   3rd Qu.:18.00   3rd Qu.:23.00  
##  Max.   :13.000   Max.   :20.00   Max.   :31.00
#mediana precio unidad 54.28
#mediana cantidad 5
#mediana rating 7
#mediana total 237.43

# Total revenue from normal customers.
# FIX: `<-` replaces `=` for top-level assignment (tidyverse style).
IngresosNormal <- sum(normal$Total)
IngresosNormal #158743.3
## [1] 158743.3
# Total units bought by normal customers.
CantidadesNormal <- sum(normal$Quantity)
CantidadesNormal #2725
## [1] 2725
# Most frequent gender among normal customers (mode via modeest::mlv).
ModaNormalGenero <- mlv(normal$Gender, method = "mfv")
ModaNormalGenero #hombres
## [1] "Male"
kable(table(normal$Gender),caption = "Frecuencia Genero(Normal)") #259 hombres
Frecuencia Genero(Normal)
Var1 Freq
Female 240
Male 259
# Most frequent product line among normal customers (mode; tied here).
# FIX: `<-` replaces `=` for top-level assignment (tidyverse style).
ModaNormalProductos <- mlv(normal$Product.line, method = "mfv")
ModaNormalProductos #Electronic 92 y Fashion_accessories 92
## [1] "Electronic"          "Fashion_accessories"
kable(table(normal$Product.line),caption = "Frecuencia Productos(Normal)")
Frecuencia Productos(Normal)
Var1 Freq
Electronic 92
Fashion_accessories 92
Food&Beverages 80
Health&Beauty 79
Home&Lifestyle 77
Sports&Travel 79
kable(table(normal$Payment),caption = "Frecuencia Pagos(Normal)") #184 ewallet
Frecuencia Pagos(Normal)
Var1 Freq
Cash 176
Credit card 139
Ewallet 184
kable(table(normal$day),caption = "Frecuencia Dias(Normal)") #82 sabado
Frecuencia Dias(Normal)
Var1 Freq
1 67
2 66
3 68
4 67
5 74
6 75
7 82
kable(table(hour(normal$datetime)),caption = "Frecuencia Horas(Normal)") #  10:00 59 compras
Frecuencia Horas(Normal)
Var1 Freq
10 59
11 44
12 43
13 54
14 35
15 46
16 40
17 38
18 48
19 52
20 40
# Most frequent rating among normal customers (mode).
# FIX: `<-` replaces `=` for top-level assignment (tidyverse style).
ModaNormalRating <- mlv(normal$Rating, method = "mfv")
ModaNormalRating #6.2
## [1] 6.2
# Total revenue per day of the week (wday(): 1 = Sunday ... 7 = Saturday)
tapply(supermarket$Total, supermarket$day, FUN=sum)
##        1        2        3        4        5        6        7 
## 44457.89 37899.08 51482.25 43731.14 45349.25 43926.34 56120.81
# Total revenue (V1) and total units sold (V2) per day of the week
aggregate(cbind(supermarket$Total,supermarket$Quantity), by=list(day=supermarket$day), FUN=sum)
##   day       V1  V2
## 1   1 44457.89 778
## 2   2 37899.08 638
## 3   3 51482.25 862
## 4   4 43731.14 800
## 5   5 45349.25 755
## 6   6 43926.34 758
## 7   7 56120.81 919
# Mean rating per day of the week
aggregate(supermarket$Rating, by=list(day=supermarket$day), FUN=mean)
##   day        x
## 1   1 7.011278
## 2   2 7.153600
## 3   3 7.003165
## 4   4 6.805594
## 5   5 6.889855
## 6   6 7.076259
## 7   7 6.901829
# Cross-table of quantity sold vs day of the week, with margin totals;
# the Sum row/column is highlighted in red.
# FIX: TRUE/FALSE spelled out instead of T/F (T and F are reassignable
# variables, not reserved words).
kable(addmargins(table(supermarket$Quantity, supermarket$day)), caption = "Cantidades vendidas cada día") %>%
  kable_styling("striped", "condensed", full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(11, bold = TRUE, color = "white", background = "#D7261E") %>%
  column_spec(9, bold = TRUE, color = "white", background = "#D7261E")
Cantidades vendidas cada día
1 2 3 4 5 6 7 Sum
1 11 19 16 16 14 18 18 112
2 12 6 19 16 15 10 13 91
3 10 13 14 11 13 14 15 90
4 12 19 17 11 14 15 21 109
5 16 15 15 17 12 15 12 102
6 12 10 18 15 15 14 14 98
7 16 16 12 11 14 12 21 102
8 15 7 10 12 16 10 15 85
9 9 11 19 13 10 15 15 92
10 20 9 18 21 15 16 20 119
Sum 133 125 158 143 138 139 164 1000
# Percentage of sales per quantity and day of the week, with margin
# totals; the Sum row/column is highlighted in red.
# FIX: TRUE/FALSE spelled out instead of T/F (T and F are reassignable
# variables, not reserved words).
kable(addmargins(prop.table(table(supermarket$Quantity, supermarket$day))*100), caption = "Porcentaje de ventas al día") %>%
  kable_styling("striped", "condensed", full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(11, bold = TRUE, color = "white", background = "#D7261E") %>%
  column_spec(9, bold = TRUE, color = "white", background = "#D7261E")
Porcentaje de ventas al día
1 2 3 4 5 6 7 Sum
1 1.1 1.9 1.6 1.6 1.4 1.8 1.8 11.2
2 1.2 0.6 1.9 1.6 1.5 1.0 1.3 9.1
3 1.0 1.3 1.4 1.1 1.3 1.4 1.5 9.0
4 1.2 1.9 1.7 1.1 1.4 1.5 2.1 10.9
5 1.6 1.5 1.5 1.7 1.2 1.5 1.2 10.2
6 1.2 1.0 1.8 1.5 1.5 1.4 1.4 9.8
7 1.6 1.6 1.2 1.1 1.4 1.2 2.1 10.2
8 1.5 0.7 1.0 1.2 1.6 1.0 1.5 8.5
9 0.9 1.1 1.9 1.3 1.0 1.5 1.5 9.2
10 2.0 0.9 1.8 2.1 1.5 1.6 2.0 11.9
Sum 13.3 12.5 15.8 14.3 13.8 13.9 16.4 100.0

6.2 Gráficos

# Register the Arial font for plotting (Windows-only).
windowsFonts("Arial" = windowsFont("Arial"))


# Calendar date of each transaction, used as the x axis in the
# time-series plots below.
supermarket$date <- as.Date(supermarket$datetime)


# Aggregate revenue (Total) and units sold (Quantity) per calendar day.

pordias <- supermarket %>% 
  group_by(date,month) %>% 
  summarise(Total = sum(Total), Quantity = sum(Quantity))
## `summarise()` regrouping output by 'date' (override with `.groups` argument)
# DAILY TOTAL REVENUE PLOT: line, loess trend (red), mean (dashed blue)

ggplot(data = pordias, aes(x = date, y = Total)) + 
  geom_line(color = "#00AFBB", size = 1) +
  stat_smooth(method="loess", colour="red") + 
  geom_hline(yintercept = mean(pordias$Total),linetype="dashed", color="blue") +
  annotate(geom="text", x=as.Date("2019-01-01"), y=3628,label="Media", size=2.5) +
  xlab("Mes") +
  theme_ipsum() + 
  ggtitle("Ingresos totales")
## `geom_smooth()` using formula 'y ~ x'

# DAILY TOTAL UNITS SOLD PLOT: line, loess trend (red), mean (dashed blue)

ggplot(data = pordias, aes(x = date, y = Quantity)) + 
  geom_line(color = "#00AFBB", size = 1) +
  stat_smooth(method="loess", colour="red") + 
  geom_hline(yintercept = mean(pordias$Quantity),linetype="dashed", color="blue") +
  annotate(geom="text", x=as.Date("2019-01-01"), y=61.91,label="Media", size=2.5) +
  xlab("Mes") +
  theme_ipsum() + 
  ggtitle("Cantidades totales vendidas")
## `geom_smooth()` using formula 'y ~ x'

# Aggregate revenue and units per date, month and city.

pormes <- supermarket %>% 
  group_by(date,month,City) %>% 
  summarise(Total = sum(Total), Quantity = sum(Quantity))
## `summarise()` regrouping output by 'date', 'month' (override with `.groups` argument)
# TOTAL REVENUE OVER TIME, ONE FACET PER CITY (free scales)
ggplot(data = pormes, aes(x = date, y = Total)) + 
  geom_line()+stat_smooth(method="loess", colour="red")+facet_wrap(~City,scale = "free") +
  geom_hline(yintercept = mean(pormes$Total),linetype="dashed", color="blue") +
  xlab("Mes") +
  theme_ipsum() + 
  ggtitle("Ingresos totales por mes según ciudades")
## `geom_smooth()` using formula 'y ~ x'

# TOTAL UNITS SOLD OVER TIME, ONE FACET PER CITY (free scales)
ggplot(data = pormes, aes(x = date, y = Quantity)) + 
  geom_line() +
  stat_smooth(method="loess", colour="red") +
  facet_wrap(~City,scale = "free") +
  geom_hline(yintercept = mean(pormes$Quantity),linetype="dashed", color="blue") +
  xlab("Mes") +
  theme_ipsum() + 
  ggtitle("Cantidades totales por mes según ciudades")

# Aggregate total revenue and quantity by day of the week.

pordia <- supermarket %>%
  group_by(day) %>%
  summarise(Total = sum(Total), Quantity = sum(Quantity))
## `summarise()` ungrouping output (override with `.groups` argument)
pordia$day <- as.factor(pordia$day)

# Total revenue per day of the week.
# NOTE(review): the labels assume day 1 = Monday ... 7 = Sunday; this depends
# on how `day` was derived upstream -- confirm against the feature-engineering
# step (lubridate::wday defaults to Sunday = 1).

ggplot(pordia, aes(x = day, y = Total, fill = day)) +
  geom_col(color = "#00AFBB", size = 1) +
  geom_hline(yintercept = mean(pordia$Total), linetype = "dashed", color = "blue") +
  xlab("Dias semana") +
  theme_ipsum() +
  scale_x_discrete(breaks = c(1, 2, 3, 4, 5, 6, 7),
                   labels = c("Lunes", "Martes", "Miercoles", "Jueves", "Viernes", "Sabado", "Domingo")) +
  theme(axis.text.x = element_text(angle = 45, size = 9)) +
  coord_flip() +
  scale_fill_discrete(name = "Días",
                      labels = c("Lunes", "Martes", "Miercoles", "Jueves", "Viernes", "Sabado", "Domingo")) +
  scale_y_continuous(breaks = c(0, 10000, 20000, 30000, 40000, 50000)) +
  ggtitle("Ingresos totales cada dia de la semana")

# Quantities sold per day of the week (same layout).

ggplot(pordia, aes(x = day, y = Quantity, fill = day)) +
  geom_col(color = "#00AFBB", size = 1) +
  geom_hline(yintercept = mean(pordia$Quantity), linetype = "dashed", color = "blue") +
  xlab("Dias semana") +
  theme_ipsum() +
  scale_x_discrete(breaks = c(1, 2, 3, 4, 5, 6, 7),
                   labels = c("Lunes", "Martes", "Miercoles", "Jueves", "Viernes", "Sabado", "Domingo")) +
  theme(axis.text.x = element_text(angle = 45, size = 9)) +
  coord_flip() +
  scale_fill_discrete(name = "Días",
                      labels = c("Lunes", "Martes", "Miercoles", "Jueves", "Viernes", "Sabado", "Domingo")) +
  scale_y_continuous(breaks = c(0, 200, 400, 600, 800)) +
  ggtitle("Cantidades totales vendidas según dia de la semana")

# Number of transactions per hour and month.
# FIX: plyr's .() evaluates inside the data frame, so columns are referenced
# bare. The original `.(Hour = supermarket$hour, ...)` re-read the global
# object, which would silently group on the wrong rows if the first argument
# were ever a filtered subset. Result (column V1 = row count) is unchanged.
monthly_trend <- ddply(supermarket, .(Hour = hour, Month = month), nrow)
monthly_trend$Month <- as.factor(monthly_trend$Month)

# Line chart: one line per month, overall mean as a reference line.

ggplot(monthly_trend, aes(Hour, V1, group = Month)) +
  geom_line(aes(color = Month), size = 2) +
  ggtitle(label = "Transacciones cada hora según mes") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, lineheight = .8, face = "bold")) +
  xlab("Hour") +
  ylab("Número de transacciones") +
  geom_hline(yintercept = mean(monthly_trend$V1))

# Stacked bar chart of the same hourly counts.

ggplot(monthly_trend, aes(Hour, V1, fill = Month)) +
  geom_bar(stat = "identity") +
  ggtitle(label = "Transacciones cada hora según mes") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, lineheight = .8, face = "bold")) +
  xlab("Hour") +
  ylab("Número de transacciones")

# Number of transactions per hour and day of the week.
# FIX: plyr's .() evaluates inside the data frame, so columns are referenced
# bare; the original `supermarket$hour` / `supermarket$day` re-read the global
# object, which is fragile if the data argument is ever a subset. Result
# (column V1 = row count) is unchanged.
daily_trend <- ddply(supermarket, .(Hour = hour, Day = day), nrow)
daily_trend$Day <- as.factor(daily_trend$Day)

# Line chart: one line per weekday, overall mean as a reference line.

ggplot(daily_trend, aes(Hour, V1, group = Day)) +
  geom_line(aes(color = Day), size = 2) +
  ggtitle(label = "Transacciones cada hora según día de la semana") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, lineheight = .8, face = "bold")) +
  xlab("Hour") +
  ylab("Numero de transacciones") +
  geom_hline(yintercept = mean(daily_trend$V1))

# Stacked bar chart of the same hourly counts.

ggplot(daily_trend, aes(Hour, V1, fill = Day)) +
  geom_bar(stat = "identity") +
  ggtitle(label = "Transacciones cada hora según día de la semana") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, lineheight = .8, face = "bold")) +
  xlab("Hour") +
  ylab("Numero de transacciones")

ANÁLISIS DE CLIENTES

# Rating by gender and product line, faceted by customer type.
# CONSISTENCY FIX: this plot was the only boxplot in the series without the
# red outlier styling used by every sibling plot below; it is added here so
# all boxplots read identically.
ggplot(supermarket, aes(x = as.factor(Gender), y = Rating, fill = Product.line)) +
    geom_boxplot(outlier.colour = "red",  # highlight outliers like the other plots
        outlier.fill = "red",
        outlier.size = 3,
      varwidth = TRUE) +  # box width proportional to group size
    xlab("Gender") +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
    facet_wrap(~Customer.type) +
    theme_ipsum()

# Quantity by gender and product line, faceted by customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Quantity, fill = Product.line)) +
  geom_boxplot(
    outlier.colour = "red",  # highlight outliers
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE          # box width proportional to group size
  ) +
  xlab("Gender") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
  facet_wrap(~Customer.type) +
  theme_ipsum()

# Total spend by gender and product line, faceted by customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Total, fill = Product.line)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE
  ) +
  xlab("Gender") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
  facet_wrap(~Customer.type) +
  theme_ipsum()

##########################

# Total spend by gender and payment method, faceted by customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Total, fill = Payment)) +
  geom_boxplot(
    outlier.colour = "red",  # highlight outliers
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE          # box width proportional to group size
  ) +
  xlab("Gender") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
  facet_wrap(~Customer.type) +
  theme_ipsum()

# Quantity by gender and payment method, faceted by customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Quantity, fill = Payment)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE
  ) +
  xlab("Gender") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
  facet_wrap(~Customer.type) +
  theme_ipsum()

# Rating by gender and payment method, faceted by customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Rating, fill = Payment)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE
  ) +
  xlab("Gender") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
  facet_wrap(~Customer.type) +
  theme_ipsum()

#############################
# Unit price.

# Unit price by gender and payment method, faceted by customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Unit.price, fill = Payment)) +
  geom_boxplot(
    outlier.colour = "red",  # highlight outliers
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE          # box width proportional to group size
  ) +
  xlab("Gender") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
  facet_wrap(~Customer.type) +
  theme_ipsum()

# Unit price by gender and product line, faceted by customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Unit.price, fill = Product.line)) +
  geom_boxplot(
    outlier.colour = "red",
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE
  ) +
  xlab("Gender") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 10, color = "red", fill = "red") +
  facet_wrap(~Customer.type) +
  theme_ipsum()

# Unit-price distribution for each purchase quantity.
ggplot(supermarket, aes(x = as.factor(Quantity), y = Unit.price)) +
  geom_boxplot()

#########################################################
# Same views as above, as violin plots.
# FIX: each plot originally added scale_fill_viridis() twice; ggplot2 keeps
# only the last scale for an aesthetic and emits "Scale for 'fill' is already
# present". The single effective call (discrete = TRUE, default alpha) is kept,
# so the rendered plots are unchanged and the warning disappears.

# Rating by gender and payment method, per customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Rating, fill = Payment)) +
    geom_violin() +
    xlab("Gender") +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 7, color = "red", fill = "red") +
    facet_wrap(~Customer.type) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none") +
  ggtitle("A Violin wrapping a boxplot")

# Quantity by gender and payment method, per customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Quantity, fill = Payment)) +
    geom_violin() +
    xlab("Gender") +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 7, color = "red", fill = "red") +
    facet_wrap(~Customer.type) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none") +
  ggtitle("A Violin wrapping a boxplot")

# Total spend by gender and payment method, per customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Total, fill = Payment)) +
    geom_violin() +
    xlab("Gender") +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 7, color = "red", fill = "red") +
    facet_wrap(~Customer.type) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none") +
  ggtitle("A Violin wrapping a boxplot")

# Violin plots by product line.
# FIX: as with the payment-method violins, the duplicate scale_fill_viridis()
# call per plot is removed; ggplot2 only kept the last one anyway (warning
# "Scale for 'fill' is already present"), so the output is identical.

# Total spend by gender and product line, per customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Total, fill = Product.line)) +
    geom_violin() +
    xlab("Gender") +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 7, color = "red", fill = "red") +
    facet_wrap(~Customer.type) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none") +
  ggtitle("A Violin wrapping a boxplot")

# Quantity by gender and product line, per customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Quantity, fill = Product.line)) +
    geom_violin() +
    xlab("Gender") +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 7, color = "red", fill = "red") +
    facet_wrap(~Customer.type) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none") +
  ggtitle("A Violin wrapping a boxplot")

# Rating by gender and product line, per customer type.
ggplot(supermarket, aes(x = as.factor(Gender), y = Rating, fill = Product.line)) +
    geom_violin() +
    xlab("Gender") +
    geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 7, color = "red", fill = "red") +
    facet_wrap(~Customer.type) +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none") +
  ggtitle("A Violin wrapping a boxplot")

ANÁLISIS DE RELACIONES

###################################################

# Relationship between quantity and total spend.
# Shows clearly that larger quantities mean larger revenue per ticket.
ggplot(supermarket, aes(x = as.factor(Quantity), y = Total)) +
  geom_boxplot(
    fill = "#99d8c9",
    outlier.colour = "red",  # highlight outliers
    outlier.fill = "red",
    outlier.size = 3,
    varwidth = TRUE          # box width proportional to group size
  ) +
  xlab("Cantidad") +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 7, color = "red", fill = "red") +
  ggtitle("Relación Quantity/Total") +
  theme_ipsum()

# Relationship between unit price and total spend.
# FIX: removed the stray trailing comma inside aes() -- it creates an empty
# argument, which errors under current ggplot2/rlang versions.

ggplot(supermarket, aes(x = Unit.price, y = Total)) +
  geom_jitter() +
  geom_smooth() +
  ggtitle("Relación Price/Total") +
  theme_ipsum()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# No apparent relationship between quantity and unit price
# (the fill mapping splits the smooth by product line).

ggplot(supermarket, aes(x = Quantity, y = Unit.price, fill = Product.line)) +
  geom_jitter() +
  geom_smooth() +
  ggtitle("Relación Quantity/Price") +
  theme_ipsum()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Total spend against hour of day, one smooth per product line.
ggplot(supermarket, aes(x = hour, y = Total, fill = Product.line)) +
  geom_jitter() +
  geom_smooth() +
  ggtitle("Relación Total/Hora") +
  theme_ipsum()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

7. Clustering

# LIGHT TRANSFORMATIONS

# One-hot encode the categorical variables; the first level of each is
# dropped to avoid perfectly collinear dummies.

results <- fastDummies::dummy_cols(supermarket,remove_first_dummy = TRUE)

# Drop the original (non-dummy) columns so only numeric features remain.
# NOTE(review): the `[,1:4]` drop is position-based and assumes the first
# four columns are the encoded categoricals -- confirm against the column
# order of `supermarket` at this point in the pipeline.
results[,1:4] <- NULL
results$Payment <- NULL
results$datetime <- NULL
results$date <- NULL


# Standardize every column (z-scores) so that distance-based clustering is
# not dominated by the variables with the largest raw scales.

resultsca <- scale(results)

7.1 Hallar mejor K para métodos no jerárquicos

# Seed so the stochastic k-means runs below are reproducible.
set.seed(20)

# OPTIMAL K -- elbow criterion.
# Total within-cluster sum of squares for k = 1..10; k = 1 equals the total
# variance of the scaled data.
wss <- numeric(10)
wss[1] <- (nrow(resultsca) - 1) * sum(apply(resultsca, 2, var))
for (k in 2:10) {
  wss[k] <- sum(kmeans(resultsca, centers = k)$withinss)
}
plot(1:10, wss, type = "b",
     xlab = "Número de Clusters",
     ylab = "Suma de cuadrados dentro de los clusters",
     main = "Cálculo del número óptimo de clusters con el criterio del codo")

# Elbow method (factoextra), with a reference line at k = 3.
fviz_nbclust(resultsca, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  labs(subtitle = "Elbow method")

# Average silhouette width for each k.

fviz_nbclust(resultsca, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

# Gap statistic, 50 bootstrap samples.

fviz_nbclust(resultsca, kmeans, nstart = 25, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic method")

# Calinski-Harabasz criterion (vegan::cascadeKM) over k = 1..10.

model <- cascadeKM(resultsca, 1, 10, iter = 100)
plot(model, sortg = TRUE)

model$results[2, ]
##  1 groups  2 groups  3 groups  4 groups  5 groups  6 groups  7 groups  8 groups 
##        NA 140.18042 115.62061  92.01490  80.72781  74.10937  68.89688  64.83061 
##  9 groups 10 groups 
##  60.60041  56.96084
# GRAPHICAL CHOICE OF K: D index (look for a significant knee).
# method = ward.D2
NbClust(resultsca, distance = "euclidean", min.nc = 2, max.nc = 10,
        method = "ward.D2", index = "dindex")

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
## 
## $All.index
##      2      3      4      5      6      7      8      9     10 
## 4.5206 4.4159 4.3172 4.2293 4.1510 4.0990 4.0596 4.0219 3.9824
# method = kmeans
NbClust(resultsca, distance = "euclidean", min.nc = 2, max.nc = 10,
        method = "kmeans", index = "dindex")

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
## 
## $All.index
##      2      3      4      5      6      7      8      9     10 
## 4.4667 4.3000 4.2326 4.1733 4.0877 4.0408 3.9545 3.9151 3.8840
# GRAPHICAL CHOICE OF K: Hubert index.
# method = ward.D2
NbClust(resultsca, distance = "euclidean", min.nc = 2, max.nc = 10,
        method = "ward.D2", index = "hubert")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 
## $All.index
##     2     3     4     5     6     7     8     9    10 
## 3e-04 2e-04 2e-04 2e-04 3e-04 3e-04 3e-04 3e-04 3e-04
# method = kmeans
NbClust(resultsca, distance = "euclidean", min.nc = 2, max.nc = 10,
        method = "kmeans", index = "hubert")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 
## $All.index
##     2     3     4     5     6     7     8     9    10 
## 2e-04 3e-04 3e-04 3e-04 3e-04 3e-04 3e-04 3e-04 3e-04
# Recommended k according to 17 different NbClust indices, over a grid of
# agglomeration/partition methods.
# FIXES: (1) expand.grid() creates factor columns by default, but NbClust
# expects character method/index arguments -- stringsAsFactors = FALSE makes
# the grid columns character; (2) Best.nc is a length-2 vector
# (Number_clusters, Value_Index), so only its first element is stored -- the
# original assignment of the whole vector relied on silent recycling.

gridk <- expand.grid(
  method = c('kmeans', 'ward.D2', 'average'),
  index = c('sdbw', 'sdindex', 'frey', 'ptbiserial', 'ratkowsky', 'beale',
            'pseudot2', 'duda', 'db', 'cindex', 'kl', 'ball', 'ch',
            'hartigan', 'dunn', 'gap', 'mcclain'),
  numk = 0,
  stringsAsFactors = FALSE
)

for (i in seq_len(nrow(gridk))) {
  res <- NbClust(resultsca, distance = "euclidean", min.nc = 2, max.nc = 10,
                 method = gridk$method[i], index = gridk$index[i])
  gridk$numk[i] <- res$Best.nc[1]  # Number_clusters component only
}

table(gridk$numk)
## 
##  1  2  3  5  6  7  8  9 10 
##  1 22  8  3  3  2  4  1  7
# k = 2 is the most frequently recommended number of clusters.

7.2 Método K-Means

# K-MEANS MODEL

# NOTE(review): the third positional argument of stats::kmeans() is iter.max,
# not nstart, so this call runs with nstart = 1 and iter.max = 25 -- unlike
# the comparison fits further below, which use nstart = 25 explicitly. Left
# unchanged on purpose: all of the reported output (cluster sizes 658/342 and
# the cluster labels attached to the data later) depends on this exact call.
set.seed(20)
k.means.fit <- kmeans(resultsca,2,25)

print(k.means.fit)
## K-means clustering with 2 clusters of sizes 658, 342
## 
## Cluster means:
##   Unit.price   Quantity     Tax.5.      Total       cogs      Rating
## 1 -0.3907778 -0.4219030 -0.6134235 -0.6134235 -0.6134235  0.02481673
## 2  0.7518474  0.8117316  1.1802124  1.1802124  1.1802124 -0.04774681
##          tmed         day       month        week         hour       daynum
## 1  0.04618716 -0.00565389  0.04841003  0.04970702 -0.003710147  0.005340918
## 2 -0.08886301  0.01087795 -0.09313976 -0.09563514  0.007138237 -0.010275802
##   City_Naypyitaw City_Yangon Customer.type_Normal Gender_Male
## 1    -0.02207874  0.02013748           0.02326500  0.03541700
## 2     0.04247898 -0.03874404          -0.04476132 -0.06814148
##   Product.line_Fashion_accessories Product.line_Food&Beverages
## 1                       0.01142089                  0.01806243
## 2                      -0.02197352                 -0.03475169
##   Product.line_Health&Beauty Product.line_Home&Lifestyle
## 1               -0.004298638                -0.005303556
## 2                0.008270478                 0.010203918
##   Product.line_Sports&Travel Payment_Credit card Payment_Ewallet
## 1                -0.01726056         0.007750807    -0.009618185
## 2                 0.03320890        -0.014912372     0.018505164
## 
## Clustering vector:
##    [1] 2 2 1 1 1 1 2 1 2 2 2 2 2 1 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1
##   [38] 2 1 1 1 2 1 1 2 1 1 2 1 2 2 1 1 1 1 2 2 1 1 1 1 2 1 1 1 2 1 1 2 2 1 2 1 2
##   [75] 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 1 1 2 2 2 2 2 1 2 1 1 1 1 1 1 1 2 1 2 2 1 1
##  [112] 1 2 2 2 1 1 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 1 2 2 1 1 2 1 2 2 1 2 2 2 2 2 1
##  [149] 2 1 2 1 1 2 1 1 2 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 1 1 1 2 1 2 1 1 1 2 1
##  [186] 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 2 2 1 2 2 1 1 1 2 1 2 1 2 1 2 1 1 1 1
##  [223] 1 2 1 2 1 1 2 1 1 1 1 1 2 2 2 2 1 2 1 1 2 1 2 1 2 2 1 1 2 1 2 2 2 1 1 2 1
##  [260] 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 2 1 1 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 1 1 2 1
##  [297] 2 2 2 2 1 1 1 2 1 1 2 2 2 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 1 2 1
##  [334] 1 1 1 2 1 1 1 2 1 1 2 2 2 1 2 1 2 1 1 2 1 2 2 1 1 2 2 1 1 1 1 1 2 2 1 1 1
##  [371] 2 2 1 1 2 1 1 2 1 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 1 2 1 1 2 2 1 1 2 1 1 1 1
##  [408] 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 1 2 1 2 2 1 2 1 2 2
##  [445] 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2
##  [482] 2 2 2 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 2 2 1
##  [519] 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 2 2 1 2 1 1 2 1 2 1 1 1 1 1 1
##  [556] 1 2 2 1 2 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1
##  [593] 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 1 2 1 1 1 2 2 1 1 1 1 2 1 2 1
##  [630] 2 1 2 1 1 1 1 1 1 2 1 2 2 2 1 2 1 2 1 2 2 1 1 2 2 1 1 1 1 1 2 2 1 1 2 1 1
##  [667] 1 2 1 1 2 1 2 1 2 1 1 2 2 2 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2
##  [704] 1 1 2 1 2 1 1 1 2 2 1 2 1 2 1 1 2 2 1 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 1
##  [741] 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 2 1 1 2 1 2 2 2 2 2 1 1 1 2 2 2 2 1 1 1 1 1
##  [778] 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 2 1 2 1 1 1 2 1 2 1 2 1 1 2 1 1 1 1 1 2
##  [815] 1 1 1 1 1 1 2 1 2 2 2 1 1 2 2 2 1 1 2 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1
##  [852] 2 2 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 2 1 2 1 2
##  [889] 1 2 1 1 2 1 2 2 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 2 1 1 2 1 2 2 2 1 1 2 1
##  [926] 1 1 2 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2
##  [963] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 2 2 1 2 2 1 2 1 2 1 1 2 1
## [1000] 1
## 
## Within cluster sum of squares by cluster:
## [1] 13224.50  6922.61
##  (between_SS / total_SS =  12.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
attributes(k.means.fit)
## $names
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## $class
## [1] "kmeans"
# Size of each cluster
k.means.fit$size
## [1] 658 342
# Plot of the partition in discriminant coordinates (fpc::plotcluster)
plotcluster(resultsca, k.means.fit$cluster)

# PCA-based cluster visualization (factoextra)
fviz_cluster(k.means.fit, data = resultsca)

# Visual comparison of the partition obtained with each k = 2..5
# (all fitted with nstart = 25 restarts).

k2 <- kmeans(resultsca, centers = 2, nstart = 25)
k3 <- kmeans(resultsca, centers = 3, nstart = 25)
k4 <- kmeans(resultsca, centers = 4, nstart = 25)
k5 <- kmeans(resultsca, centers = 5, nstart = 25)

p1 <- fviz_cluster(k2, geom = "point", data = resultsca) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = resultsca) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = resultsca) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = resultsca) + ggtitle("k = 5")

# Arrange the four panels in a 2x2 grid.
grid.arrange(p1, p2, p3, p4, nrow = 2)

# Reload the pre-processed dataset from the repository.
supe <- "https://github.com/Juanmick/TFM/blob/master/supermarket.rds?raw=true"
supermarket <- readRDS(url(supe))

# Attach each observation's cluster label.
# NOTE(review): assumes the reloaded rows are in the same order as the matrix
# the model was fitted on -- confirm against the preprocessing steps.
supermarket$cluster <- k.means.fit$cluster

# Mean ticket (Total) per cluster.
df <- supermarket %>%
  group_by(cluster) %>%
  summarise(mean = mean(Total))
## `summarise()` ungrouping output (override with `.groups` argument)

dfkm <- supermarket
table(dfkm$cluster)
## 
##   1   2 
## 658 342
#saveRDS(dfkm, file = "dfkm.rds")

#dfkm <- readRDS("C:/TFM/dfkm.rds")

# Profile of cluster 1 (Hmisc::describe summary of every variable).
dfkm1 <- dfkm %>% filter(cluster == 1)
describe(dfkm1)
## dfkm1 
## 
##  19  Variables      658  Observations
## --------------------------------------------------------------------------------
## City 
##        n  missing distinct 
##      658        0        3 
##                                         
## Value       Mandalay Naypyitaw    Yangon
## Frequency        219       209       230
## Proportion     0.333     0.318     0.350
## --------------------------------------------------------------------------------
## Customer.type 
##        n  missing distinct 
##      658        0        2 
##                         
## Value      Member Normal
## Frequency     322    336
## Proportion  0.489  0.511
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      658        0        2 
##                         
## Value      Female   Male
## Frequency     318    340
## Proportion  0.483  0.517
## --------------------------------------------------------------------------------
## Product.line 
##        n  missing distinct 
##      658        0        6 
## 
## lowest : Electronic          Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle     
## highest: Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle      Sports&Travel      
##                                                                       
## Value               Electronic Fashion_accessories      Food&Beverages
## Frequency                  111                 120                 119
## Proportion               0.169               0.182               0.181
##                                                                       
## Value            Health&Beauty      Home&Lifestyle       Sports&Travel
## Frequency                   99                 104                 105
## Proportion               0.150               0.158               0.160
## --------------------------------------------------------------------------------
## Unit.price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0      626        1    45.32    27.77    13.58    16.24 
##      .25      .50      .75      .90      .95 
##    24.75    40.57    62.86    82.98    92.19 
## 
## lowest : 10.08 10.13 10.16 10.17 10.18, highest: 99.69 99.70 99.79 99.82 99.89
## --------------------------------------------------------------------------------
## Quantity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0       10    0.984    4.277    2.949        1        1 
##      .25      .50      .75      .90      .95 
##        2        4        6        8        9 
## 
## lowest :  1  2  3  4  5, highest:  6  7  8  9 10
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency    112    91    90    96    75    57    41    33    32    31
## Proportion 0.170 0.138 0.137 0.146 0.114 0.087 0.062 0.050 0.049 0.047
## --------------------------------------------------------------------------------
## Tax.5. 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0      650        1    8.197     5.41    1.531    2.352 
##      .25      .50      .75      .90      .95 
##    4.154    7.768   11.998   15.139   16.484 
## 
## lowest :  0.5085  0.6045  0.6270  0.6390  0.6990
## highest: 18.3080 18.6390 19.1300 19.1555 19.4635
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0      650        1    172.1    113.6    32.15    49.39 
##      .25      .50      .75      .90      .95 
##    87.23   163.12   251.96   317.92   346.16 
## 
## lowest :  10.6785  12.6945  13.1670  13.4190  14.6790
## highest: 384.4680 391.4190 401.7300 402.2655 408.7335
## --------------------------------------------------------------------------------
## datetime 
##                   n             missing            distinct                Info 
##                 658                   0                 657                   1 
##                Mean                 Gmd                 .05                 .10 
## 2019-02-15 21:23:32             2505485 2019-01-06 13:22:33 2019-01-11 16:07:00 
##                 .25                 .50                 .75                 .90 
## 2019-01-25 18:47:15 2019-02-15 14:36:00 2019-03-09 12:11:15 2019-03-22 14:12:36 
##                 .95 
## 2019-03-26 14:47:30 
## 
## lowest : 2019-01-01 11:40:00 2019-01-01 11:43:00 2019-01-01 15:51:00 2019-01-01 19:31:00 2019-01-01 20:26:00
## highest: 2019-03-30 12:51:00 2019-03-30 13:22:00 2019-03-30 16:34:00 2019-03-30 17:04:00 2019-03-30 20:37:00
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct 
##      658        0        3 
##                                               
## Value             Cash Credit card     Ewallet
## Frequency          227         207         224
## Proportion       0.345       0.315       0.340
## --------------------------------------------------------------------------------
## cogs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0      650        1    163.9    108.2    30.62    47.04 
##      .25      .50      .75      .90      .95 
##    83.08   155.35   239.96   302.78   329.68 
## 
## lowest :  10.17  12.09  12.54  12.78  13.98, highest: 366.16 372.78 382.60 383.11 389.27
## --------------------------------------------------------------------------------
## Rating 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0       61        1    7.015    1.957    4.285    4.700 
##      .25      .50      .75      .90      .95 
##    5.700    7.000    8.500    9.400    9.700 
## 
## lowest :  4.0  4.1  4.2  4.3  4.4, highest:  9.6  9.7  9.8  9.9 10.0
## --------------------------------------------------------------------------------
## tmed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0       88        1    26.82    2.919    22.20    23.27 
##      .25      .50      .75      .90      .95 
##    25.10    27.00    28.50    29.90    30.60 
## 
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 31.4 31.7 32.8 33.6 34.0
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd 
##      658        0        7    0.979     4.09    2.281 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency     88    83   100   100    87    98   102
## Proportion 0.134 0.126 0.152 0.152 0.132 0.149 0.155
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd 
##      658        0        3    0.887    2.033    0.906 
##                             
## Value          1     2     3
## Frequency    217   202   239
## Proportion 0.330 0.307 0.363
## --------------------------------------------------------------------------------
## week 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0       13    0.994    7.175    4.153        1        2 
##      .25      .50      .75      .90      .95 
##        4        7       10       12       13 
## 
## lowest :  1  2  3  4  5, highest:  9 10 11 12 13
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency     36    45    47    57    50    66    47    45    55    59    53
## Proportion 0.055 0.068 0.071 0.087 0.076 0.100 0.071 0.068 0.084 0.090 0.081
##                       
## Value         12    13
## Frequency     51    47
## Proportion 0.078 0.071
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0       11    0.991     14.9    3.691       10       10 
##      .25      .50      .75      .90      .95 
##       12       15       18       19       20 
## 
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##                                                                             
## Value         10    11    12    13    14    15    16    17    18    19    20
## Frequency     70    57    61    71    44    68    52    49    67    66    53
## Proportion 0.106 0.087 0.093 0.108 0.067 0.103 0.079 0.074 0.102 0.100 0.081
## --------------------------------------------------------------------------------
## daynum 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      658        0       31    0.999     15.3       10     2.00     4.00 
##      .25      .50      .75      .90      .95 
##     7.25    15.00    23.00    27.00    28.15 
## 
## lowest :  1  2  3  4  5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster 
##        n  missing distinct     Info     Mean      Gmd 
##      658        0        1        0        1        0 
##               
## Value        1
## Frequency  658
## Proportion   1
## --------------------------------------------------------------------------------
# Characteristics of cluster 2 (k-means assignment)
dfkm2 <- dfkm %>% filter(cluster == 2)
describe(dfkm2)
## dfkm2 
## 
##  19  Variables      342  Observations
## --------------------------------------------------------------------------------
## City 
##        n  missing distinct 
##      342        0        3 
##                                         
## Value       Mandalay Naypyitaw    Yangon
## Frequency        113       119       110
## Proportion     0.330     0.348     0.322
## --------------------------------------------------------------------------------
## Customer.type 
##        n  missing distinct 
##      342        0        2 
##                         
## Value      Member Normal
## Frequency     179    163
## Proportion  0.523  0.477
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      342        0        2 
##                         
## Value      Female   Male
## Frequency     183    159
## Proportion  0.535  0.465
## --------------------------------------------------------------------------------
## Product.line 
##        n  missing distinct 
##      342        0        6 
## 
## lowest : Electronic          Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle     
## highest: Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle      Sports&Travel      
##                                                                       
## Value               Electronic Fashion_accessories      Food&Beverages
## Frequency                   59                  58                  55
## Proportion               0.173               0.170               0.161
##                                                                       
## Value            Health&Beauty      Home&Lifestyle       Sports&Travel
## Frequency                   53                  56                  61
## Proportion               0.155               0.164               0.178
## --------------------------------------------------------------------------------
## Unit.price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0      335        1    75.59    19.66    44.84    51.10 
##      .25      .50      .75      .90      .95 
##    63.45    77.30    90.18    97.26    98.98 
## 
## lowest : 33.21 34.21 35.54 36.98 37.55, highest: 99.78 99.82 99.83 99.92 99.96
## --------------------------------------------------------------------------------
## Quantity 
##        n  missing distinct     Info     Mean      Gmd 
##      342        0        7    0.966    7.883    2.006 
## 
## lowest :  4  5  6  7  8, highest:  6  7  8  9 10
##                                                     
## Value          4     5     6     7     8     9    10
## Frequency     13    27    41    61    52    60    88
## Proportion 0.038 0.079 0.120 0.178 0.152 0.175 0.257
## --------------------------------------------------------------------------------
## Tax.5. 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0      340        1     29.2     9.31    18.86    19.45 
##      .25      .50      .75      .90      .95 
##    22.21    27.52    35.37    40.98    44.60 
## 
## lowest : 16.605 17.105 17.770 17.828 17.829, highest: 48.690 48.750 49.260 49.490 49.650
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0      340        1    613.2    195.5    396.0    408.4 
##      .25      .50      .75      .90      .95 
##    466.4    577.9    742.7    860.7    936.5 
## 
## lowest :  348.705  359.205  373.170  374.388  374.409
## highest: 1022.490 1023.750 1034.460 1039.290 1042.650
## --------------------------------------------------------------------------------
## datetime 
##                   n             missing            distinct                Info 
##                 342                   0                 340                   1 
##                Mean                 Gmd                 .05                 .10 
## 2019-02-12 06:00:35             2495827 2019-01-06 11:27:09 2019-01-10 14:19:18 
##                 .25                 .50                 .75                 .90 
## 2019-01-21 22:14:30 2019-02-09 01:10:00 2019-03-05 17:58:45 2019-03-19 18:56:54 
##                 .95 
## 2019-03-23 19:06:09 
## 
## lowest : 2019-01-01 10:39:00 2019-01-01 11:36:00 2019-01-01 13:55:00 2019-01-01 14:42:00 2019-01-01 14:47:00
## highest: 2019-03-30 10:18:00 2019-03-30 14:43:00 2019-03-30 14:58:00 2019-03-30 17:58:00 2019-03-30 19:26:00
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct 
##      342        0        3 
##                                               
## Value             Cash Credit card     Ewallet
## Frequency          117         104         121
## Proportion       0.342       0.304       0.354
## --------------------------------------------------------------------------------
## cogs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0      340        1      584    186.2    377.1    389.0 
##      .25      .50      .75      .90      .95 
##    444.2    550.4    707.3    819.7    891.9 
## 
## lowest : 332.10 342.10 355.40 356.56 356.58, highest: 973.80 975.00 985.20 989.80 993.00
## --------------------------------------------------------------------------------
## Rating 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0       61        1    6.891    2.037      4.3      4.5 
##      .25      .50      .75      .90      .95 
##      5.3      6.9      8.4      9.3      9.6 
## 
## lowest :  4.0  4.1  4.2  4.3  4.4, highest:  9.6  9.7  9.8  9.9 10.0
## --------------------------------------------------------------------------------
## tmed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0       76    0.999    26.47    2.885    22.10    23.02 
##      .25      .50      .75      .90      .95 
##    24.90    26.50    28.20    29.79    30.29 
## 
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 31.0 31.4 31.7 32.8 34.0
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd 
##      342        0        7    0.978    4.123    2.316 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency     45    42    58    43    51    41    62
## Proportion 0.132 0.123 0.170 0.126 0.149 0.120 0.181
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd 
##      342        0        3    0.883    1.915   0.9083 
##                             
## Value          1     2     3
## Frequency    135   101   106
## Proportion 0.395 0.295 0.310
## --------------------------------------------------------------------------------
## week 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0       13    0.993    6.652    4.091        1        2 
##      .25      .50      .75      .90      .95 
##        4        6       10       12       12 
## 
## lowest :  1  2  3  4  5, highest:  9 10 11 12 13
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency     19    28    35    36    33    26    25    15    32    29    25
## Proportion 0.056 0.082 0.102 0.105 0.096 0.076 0.073 0.044 0.094 0.085 0.073
##                       
## Value         12    13
## Frequency     25    14
## Proportion 0.073 0.041
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0       11     0.99    14.93    3.622       10       11 
##      .25      .50      .75      .90      .95 
##       12       15       18       19       20 
## 
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##                                                                             
## Value         10    11    12    13    14    15    16    17    18    19    20
## Frequency     31    33    28    32    39    34    25    25    26    47    22
## Proportion 0.091 0.096 0.082 0.094 0.114 0.099 0.073 0.073 0.076 0.137 0.064
## --------------------------------------------------------------------------------
## daynum 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      342        0       31    0.999    15.17    10.12        2        3 
##      .25      .50      .75      .90      .95 
##        8       15       23       27       29 
## 
## lowest :  1  2  3  4  5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster 
##        n  missing distinct     Info     Mean      Gmd 
##      342        0        1        0        2        0 
##               
## Value        2
## Frequency  342
## Proportion   1
## --------------------------------------------------------------------------------

7.3 Método con PAM (Partitioning Around Medoids)

# PAM (Partitioning Around Medoids) clustering with k = 2.
# Seed fixed so the medoid search is reproducible.
set.seed(20)
pam.res <- pam(resultsca, 2)

# Cluster sizes
resultadopam <- as.data.frame(pam.res$clustering)
table(resultadopam)
## resultadopam
##   1   2 
## 670 330

# Visualizing PAM clusters with fviz_cluster()
fviz_cluster(pam.res)

# Change the color palette and theme
fviz_cluster(pam.res, resultsca,
             palette = "Set2", ggtheme = theme_minimal())

# Append the cluster assignment as a new column of the dataset
supermarket$cluster <- pam.res$clustering

# Mean ticket (Total) per cluster; .groups = "drop" returns an
# ungrouped tibble and silences the dplyr grouping message.
df <- supermarket %>%
  group_by(cluster) %>%
  summarise(mean = mean(Total), .groups = "drop")

dfpam <- supermarket
table(dfpam$cluster)
## 
##   1   2 
## 670 330
# Characteristics of cluster 1 (PAM assignment)
dfpam1 <- dfpam %>% filter(cluster == 1)
describe(dfpam1)
## dfpam1 
## 
##  19  Variables      670  Observations
## --------------------------------------------------------------------------------
## City 
##        n  missing distinct 
##      670        0        3 
##                                         
## Value       Mandalay Naypyitaw    Yangon
## Frequency        261       250       159
## Proportion     0.390     0.373     0.237
## --------------------------------------------------------------------------------
## Customer.type 
##        n  missing distinct 
##      670        0        2 
##                         
## Value      Member Normal
## Frequency     281    389
## Proportion  0.419  0.581
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      670        0        2 
##                         
## Value      Female   Male
## Frequency     277    393
## Proportion  0.413  0.587
## --------------------------------------------------------------------------------
## Product.line 
##        n  missing distinct 
##      670        0        6 
## 
## lowest : Electronic          Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle     
## highest: Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle      Sports&Travel      
##                                                                       
## Value               Electronic Fashion_accessories      Food&Beverages
## Frequency                  111                 127                 114
## Proportion               0.166               0.190               0.170
##                                                                       
## Value            Health&Beauty      Home&Lifestyle       Sports&Travel
## Frequency                  110                  95                 113
## Proportion               0.164               0.142               0.169
## --------------------------------------------------------------------------------
## Unit.price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0      644        1    50.81    29.85    14.29    17.48 
##      .25      .50      .75      .90      .95 
##    27.10    48.50    72.55    88.66    95.52 
## 
## lowest : 10.17 10.18 10.53 10.56 10.59, highest: 99.73 99.78 99.79 99.82 99.96
## --------------------------------------------------------------------------------
## Quantity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0       10    0.989    5.106    3.364      1.0      1.0 
##      .25      .50      .75      .90      .95 
##      3.0      5.0      7.0      9.1     10.0 
## 
## lowest :  1  2  3  4  5, highest:  6  7  8  9 10
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency     96    71    66    76    67    63    67    44    53    67
## Proportion 0.143 0.106 0.099 0.113 0.100 0.094 0.100 0.066 0.079 0.100
## --------------------------------------------------------------------------------
## Tax.5. 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0      667        1    12.69    10.99    1.567    2.613 
##      .25      .50      .75      .90      .95 
##    4.684    9.556   17.954   27.446   34.043 
## 
## lowest :  0.5085  0.6045  0.6270  0.6390  0.6990
## highest: 47.7200 47.7900 48.6850 48.7500 49.4900
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0      667        1    266.6    230.8    32.90    54.86 
##      .25      .50      .75      .90      .95 
##    98.37   200.67   377.03   576.36   714.91 
## 
## lowest :   10.6785   12.6945   13.1670   13.4190   14.6790
## highest: 1002.1200 1003.5900 1022.3850 1023.7500 1039.2900
## --------------------------------------------------------------------------------
## datetime 
##                   n             missing            distinct                Info 
##                 670                   0                 668                   1 
##                Mean                 Gmd                 .05                 .10 
## 2019-02-09 07:42:43             2381919 2019-01-05 11:40:33 2019-01-08 14:29:30 
##                 .25                 .50                 .75                 .90 
## 2019-01-19 12:46:15 2019-02-07 11:59:30 2019-03-02 19:43:30 2019-03-14 11:08:24 
##                 .95 
## 2019-03-19 15:01:24 
## 
## lowest : 2019-01-01 10:39:00 2019-01-01 11:36:00 2019-01-01 11:40:00 2019-01-01 11:43:00 2019-01-01 13:55:00
## highest: 2019-03-29 14:28:00 2019-03-29 14:44:00 2019-03-30 12:51:00 2019-03-30 14:58:00 2019-03-30 20:37:00
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct 
##      670        0        3 
##                                               
## Value             Cash Credit card     Ewallet
## Frequency          263         235         172
## Proportion       0.393       0.351       0.257
## --------------------------------------------------------------------------------
## cogs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0      667        1    253.9    219.8    31.34    52.25 
##      .25      .50      .75      .90      .95 
##    93.69   191.11   359.07   548.92   680.87 
## 
## lowest :  10.17  12.09  12.54  12.78  13.98, highest: 954.40 955.80 973.70 975.00 989.80
## --------------------------------------------------------------------------------
## Rating 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0       61        1    7.112    1.957    4.300    4.800 
##      .25      .50      .75      .90      .95 
##    5.725    7.100    8.600    9.500    9.700 
## 
## lowest :  4.0  4.1  4.2  4.3  4.4, highest:  9.6  9.7  9.8  9.9 10.0
## --------------------------------------------------------------------------------
## tmed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0       81    0.999    26.05    2.851    22.00    22.70 
##      .25      .50      .75      .90      .95 
##    24.50    26.00    27.80    29.12    30.00 
## 
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 30.9 31.0 31.4 32.8 34.0
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd 
##      670        0        7    0.977    4.367    2.245 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency     67    75   100    96    95   109   128
## Proportion 0.100 0.112 0.149 0.143 0.142 0.163 0.191
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd 
##      670        0        3     0.88    1.881   0.8925 
##                             
## Value          1     2     3
## Frequency    272   206   192
## Proportion 0.406 0.307 0.287
## --------------------------------------------------------------------------------
## week 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0       13    0.992     6.23    3.919        1        2 
##      .25      .50      .75      .90      .95 
##        3        6        9       11       12 
## 
## lowest :  1  2  3  4  5, highest:  9 10 11 12 13
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency     51    66    64    60    62    76    48    33    54    64    52
## Proportion 0.076 0.099 0.096 0.090 0.093 0.113 0.072 0.049 0.081 0.096 0.078
##                       
## Value         12    13
## Frequency     25    15
## Proportion 0.037 0.022
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0       11    0.991    14.98    3.698     10.0     10.9 
##      .25      .50      .75      .90      .95 
##     12.0     15.0     18.0     19.0     20.0 
## 
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##                                                                             
## Value         10    11    12    13    14    15    16    17    18    19    20
## Frequency     67    56    67    65    50    68    50    50    69    71    57
## Proportion 0.100 0.084 0.100 0.097 0.075 0.101 0.075 0.075 0.103 0.106 0.085
## --------------------------------------------------------------------------------
## daynum 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      670        0       31    0.998    13.24    9.624        2        3 
##      .25      .50      .75      .90      .95 
##        6       12       20       26       28 
## 
## lowest :  1  2  3  4  5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster 
##        n  missing distinct     Info     Mean      Gmd 
##      670        0        1        0        1        0 
##               
## Value        1
## Frequency  670
## Proportion   1
## --------------------------------------------------------------------------------
# Characteristics of cluster 2 (PAM assignment)
dfpam2 <- dfpam %>% filter(cluster == 2)
describe(dfpam2)
## dfpam2 
## 
##  19  Variables      330  Observations
## --------------------------------------------------------------------------------
## City 
##        n  missing distinct 
##      330        0        3 
##                                         
## Value       Mandalay Naypyitaw    Yangon
## Frequency         71        78       181
## Proportion     0.215     0.236     0.548
## --------------------------------------------------------------------------------
## Customer.type 
##        n  missing distinct 
##      330        0        2 
##                         
## Value      Member Normal
## Frequency     220    110
## Proportion  0.667  0.333
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      330        0        2 
##                         
## Value      Female   Male
## Frequency     224    106
## Proportion  0.679  0.321
## --------------------------------------------------------------------------------
## Product.line 
##        n  missing distinct 
##      330        0        6 
## 
## lowest : Electronic          Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle     
## highest: Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle      Sports&Travel      
##                                                                       
## Value               Electronic Fashion_accessories      Food&Beverages
## Frequency                   59                  51                  60
## Proportion               0.179               0.155               0.182
##                                                                       
## Value            Health&Beauty      Home&Lifestyle       Sports&Travel
## Frequency                   42                  65                  53
## Proportion               0.127               0.197               0.161
## --------------------------------------------------------------------------------
## Unit.price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0      319        1    65.54    28.36    20.52    28.28 
##      .25      .50      .75      .90      .95 
##    47.67    71.54    87.59    96.03    98.70 
## 
## lowest : 10.08 10.13 10.16 10.69 12.03, highest: 99.71 99.82 99.83 99.89 99.92
## --------------------------------------------------------------------------------
## Quantity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0       10    0.987     6.33    3.118        2        2 
##      .25      .50      .75      .90      .95 
##        4        7        9       10       10 
## 
## lowest :  1  2  3  4  5, highest:  6  7  8  9 10
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency     16    20    24    33    35    35    35    41    39    52
## Proportion 0.048 0.061 0.073 0.100 0.106 0.106 0.106 0.124 0.118 0.158
## --------------------------------------------------------------------------------
## Tax.5. 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0      328        1    20.83    14.29    3.668    4.579 
##      .25      .50      .75      .90      .95 
##    9.966   19.519   30.741   38.474   41.847 
## 
## lowest :  1.2030  1.2645  1.9300  1.9560  2.1480
## highest: 45.3250 48.6050 48.6900 49.2600 49.6500
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0      328        1    437.5    300.1    77.04    96.16 
##      .25      .50      .75      .90      .95 
##   209.28   409.89   645.57   807.95   878.78 
## 
## lowest :   25.2630   26.5545   40.5300   41.0760   45.1080
## highest:  951.8250 1020.7050 1022.4900 1034.4600 1042.6500
## --------------------------------------------------------------------------------
## datetime 
##                   n             missing            distinct                Info 
##                 330                   0                 329                   1 
##                Mean                 Gmd                 .05                 .10 
## 2019-02-25 10:58:17             2378492 2019-01-17 02:32:54 2019-01-22 10:56:48 
##                 .25                 .50                 .75                 .90 
## 2019-02-03 22:23:15 2019-02-26 17:19:00 2019-03-20 07:19:30 2019-03-26 14:39:24 
##                 .95 
## 2019-03-28 15:30:33 
## 
## lowest : 2019-01-01 14:42:00 2019-01-04 13:34:00 2019-01-05 13:08:00 2019-01-06 13:58:00 2019-01-07 15:01:00
## highest: 2019-03-30 14:43:00 2019-03-30 16:34:00 2019-03-30 17:04:00 2019-03-30 17:58:00 2019-03-30 19:26:00
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct 
##      330        0        3 
##                                               
## Value             Cash Credit card     Ewallet
## Frequency           81          76         173
## Proportion       0.245       0.230       0.524
## --------------------------------------------------------------------------------
## cogs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0      328        1    416.7    285.8    73.37    91.58 
##      .25      .50      .75      .90      .95 
##   199.31   390.38   614.83   769.48   836.94 
## 
## lowest :  24.06  25.29  38.60  39.12  42.96, highest: 906.50 972.10 973.80 985.20 993.00
## --------------------------------------------------------------------------------
## Rating 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0       60        1    6.689    1.997      4.2      4.4 
##      .25      .50      .75      .90      .95 
##      5.1      6.6      8.0      9.2      9.6 
## 
## lowest :  4.0  4.1  4.2  4.3  4.4, highest:  9.6  9.7  9.8  9.9 10.0
## --------------------------------------------------------------------------------
## tmed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0       69    0.999       28    2.432    24.23    25.28 
##      .25      .50      .75      .90      .95 
##    26.50    28.00    29.50    30.50    31.00 
## 
## lowest : 21.6 22.1 22.2 22.7 23.2, highest: 31.4 31.7 32.8 33.6 34.0
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd 
##      330        0        7    0.976    3.561    2.238 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency     66    50    58    47    43    30    36
## Proportion 0.200 0.152 0.176 0.142 0.130 0.091 0.109
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd 
##      330        0        3    0.861    2.221   0.8673 
##                             
## Value          1     2     3
## Frequency     80    97   153
## Proportion 0.242 0.294 0.464
## --------------------------------------------------------------------------------
## week 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0       13    0.989    8.552    3.942     3.00     4.00 
##      .25      .50      .75      .90      .95 
##     5.25     9.00    12.00    13.00    13.00 
## 
## lowest :  1  2  3  4  5, highest:  9 10 11 12 13
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency      4     7    18    33    21    16    24    27    33    24    26
## Proportion 0.012 0.021 0.055 0.100 0.064 0.048 0.073 0.082 0.100 0.073 0.079
##                       
## Value         12    13
## Frequency     51    46
## Proportion 0.155 0.139
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0       11     0.99    14.77    3.598       10       10 
##      .25      .50      .75      .90      .95 
##       12       15       18       19       20 
## 
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##                                                                             
## Value         10    11    12    13    14    15    16    17    18    19    20
## Frequency     34    34    22    38    33    34    27    24    24    42    18
## Proportion 0.103 0.103 0.067 0.115 0.100 0.103 0.082 0.073 0.073 0.127 0.055
## --------------------------------------------------------------------------------
## daynum 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      330        0       31    0.998    19.35    8.757        4        7 
##      .25      .50      .75      .90      .95 
##       14       21       26       28       29 
## 
## lowest :  1  2  3  4  5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster 
##        n  missing distinct     Info     Mean      Gmd 
##      330        0        1        0        2        0 
##               
## Value        2
## Frequency  330
## Proportion   1
## --------------------------------------------------------------------------------

7.4 Método con Hierarchical Cluster Analysis

# Hierarchical clustering on the component scores `resultsca` (built in an
# earlier section of this file). Seed fixed so dendrogram plots reproduce.
set.seed(20)
# Using hclust
# Euclidean dissimilarity matrix; hclust() requires a "dist" object.
d <- dist(resultsca, method = "euclidean")

# Hierarchical clustering with Ward's linkage ("ward.D") -- note the
# original comment said "Complete Linkage", which did not match the code.
hc1 <- hclust(d, method = "ward.D" )

# Base-graphics dendrogram with the k = 2 partition boxed in colour.
plot(hc1, cex = 0.6, hang = -1)
rect.hclust(hc1, k = 2, border = 2:5)

# ggplot-based dendrogram (factoextra); the dashed line marks the
# approximate cut height that yields k = 2 groups.
fviz_dend(x = hc1, k = 2, cex = 0.6) +
  geom_hline(yintercept = 250, linetype = "dashed") +
  labs(title = "Hierarchical clustering",
       subtitle = "Method Ward.D, K=2")
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

# Cut the dendrogram into 2 groups: one cluster label per observation.
sub_grp <- cutree(hc1, k = 2)
table(sub_grp)
## sub_grp
##   1   2 
## 800 200
# Attach the cluster assignment to a copy of the original data.
dfh <- supermarket
dfh$cluster <- sub_grp

# Mean ticket amount (Total) per cluster.
# NOTE(review): this overwrites any earlier object named `df`.
df <- dfh %>% group_by(cluster) %>%
  summarise(mean=mean(Total))
## `summarise()` ungrouping output (override with `.groups` argument)
# Visualise the two clusters over the `resultsca` component space.
fviz_cluster(list(data = resultsca, cluster = sub_grp))

# Persist the clustered data frame for reuse in later chunks.
saveRDS(dfh, file = "dfh.rds")



# Profile of cluster 1: subset the clustered data frame and summarise
# every variable with Hmisc::describe().
dfh1 <- dfh %>% filter(cluster == 1)
describe(dfh1)
## dfh1 
## 
##  19  Variables      800  Observations
## --------------------------------------------------------------------------------
## City 
##        n  missing distinct 
##      800        0        3 
##                                         
## Value       Mandalay Naypyitaw    Yangon
## Frequency        267       250       283
## Proportion     0.334     0.312     0.354
## --------------------------------------------------------------------------------
## Customer.type 
##        n  missing distinct 
##      800        0        2 
##                         
## Value      Member Normal
## Frequency     394    406
## Proportion  0.492  0.507
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      800        0        2 
##                         
## Value      Female   Male
## Frequency     396    404
## Proportion  0.495  0.505
## --------------------------------------------------------------------------------
## Product.line 
##        n  missing distinct 
##      800        0        6 
## 
## lowest : Electronic          Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle     
## highest: Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle      Sports&Travel      
##                                                                       
## Value               Electronic Fashion_accessories      Food&Beverages
## Frequency                  132                 148                 151
## Proportion               0.165               0.185               0.189
##                                                                       
## Value            Health&Beauty      Home&Lifestyle       Sports&Travel
## Frequency                  127                 130                 112
## Proportion               0.159               0.162               0.140
## --------------------------------------------------------------------------------
## Unit.price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0      760        1     50.1    29.42    14.35    17.62 
##      .25      .50      .75      .90      .95 
##    27.27    47.33    71.88    88.17    94.67 
## 
## lowest : 10.08 10.13 10.16 10.17 10.18, highest: 99.79 99.82 99.83 99.89 99.92
## --------------------------------------------------------------------------------
## Quantity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0       10    0.988    4.779    3.109        1        1 
##      .25      .50      .75      .90      .95 
##        2        5        7        9       10 
## 
## lowest :  1  2  3  4  5, highest:  6  7  8  9 10
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency    112    91    90   105    99    84    65    53    49    52
## Proportion 0.140 0.114 0.112 0.131 0.124 0.105 0.081 0.066 0.061 0.065
## --------------------------------------------------------------------------------
## Tax.5. 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0      792        1    11.06    8.524    1.647    2.671 
##      .25      .50      .75      .90      .95 
##    4.653    9.290   15.572   22.722   25.577 
## 
## lowest :  0.5085  0.6045  0.6270  0.6390  0.6990
## highest: 35.6900 40.9750 41.2900 43.7490 43.9350
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0      792        1    232.3      179    34.59    56.10 
##      .25      .50      .75      .90      .95 
##    97.72   195.08   327.01   477.17   537.11 
## 
## lowest :  10.6785  12.6945  13.1670  13.4190  14.6790
## highest: 749.4900 860.4750 867.0900 918.7290 922.6350
## --------------------------------------------------------------------------------
## datetime 
##                   n             missing            distinct                Info 
##                 800                   0                 798                   1 
##                Mean                 Gmd                 .05                 .10 
## 2019-02-15 20:32:58             2542407 2019-01-06 12:10:57 2019-01-10 17:15:54 
##                 .25                 .50                 .75                 .90 
## 2019-01-25 14:52:15 2019-02-15 17:05:30 2019-03-09 23:26:00 2019-03-22 19:07:24 
##                 .95 
## 2019-03-26 19:28:48 
## 
## lowest : 2019-01-01 11:40:00 2019-01-01 11:43:00 2019-01-01 13:55:00 2019-01-01 15:51:00 2019-01-01 19:07:00
## highest: 2019-03-30 16:34:00 2019-03-30 17:04:00 2019-03-30 17:58:00 2019-03-30 19:26:00 2019-03-30 20:37:00
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct 
##      800        0        3 
##                                               
## Value             Cash Credit card     Ewallet
## Frequency          274         245         281
## Proportion       0.342       0.306       0.351
## --------------------------------------------------------------------------------
## cogs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0      792        1    221.2    170.5    32.95    53.43 
##      .25      .50      .75      .90      .95 
##    93.06   185.79   311.44   454.45   511.53 
## 
## lowest :  10.17  12.09  12.54  12.78  13.98, highest: 713.80 819.50 825.80 874.98 878.70
## --------------------------------------------------------------------------------
## Rating 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0       61        1    7.014    1.974      4.3      4.6 
##      .25      .50      .75      .90      .95 
##      5.6      7.0      8.5      9.4      9.7 
## 
## lowest :  4.0  4.1  4.2  4.3  4.4, highest:  9.6  9.7  9.8  9.9 10.0
## --------------------------------------------------------------------------------
## tmed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0       88        1     26.8    2.948     22.2     23.2 
##      .25      .50      .75      .90      .95 
##     25.1     27.0     28.5     30.0     30.6 
## 
## lowest : 19.8 20.1 20.5 21.2 21.3, highest: 31.4 31.7 32.8 33.6 34.0
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd 
##      800        0        7    0.979    4.093    2.286 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency    105   104   123   118   109   112   129
## Proportion 0.131 0.130 0.154 0.148 0.136 0.140 0.161
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd 
##      800        0        3    0.887     2.03   0.9117 
##                             
## Value          1     2     3
## Frequency    269   238   293
## Proportion 0.336 0.298 0.366
## --------------------------------------------------------------------------------
## week 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0       13    0.994    7.171    4.199        1        2 
##      .25      .50      .75      .90      .95 
##        4        7       10       12       13 
## 
## lowest :  1  2  3  4  5, highest:  9 10 11 12 13
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency     44    58    58    74    58    75    54    50    68    73    67
## Proportion 0.055 0.072 0.072 0.092 0.072 0.094 0.068 0.062 0.085 0.091 0.084
##                       
## Value         12    13
## Frequency     63    58
## Proportion 0.079 0.072
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0       11    0.991    15.01    3.646       10       11 
##      .25      .50      .75      .90      .95 
##       12       15       18       19       20 
## 
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##                                                                             
## Value         10    11    12    13    14    15    16    17    18    19    20
## Frequency     76    66    73    83    64    80    64    62    80    90    62
## Proportion 0.095 0.082 0.091 0.104 0.080 0.100 0.080 0.078 0.100 0.112 0.078
## --------------------------------------------------------------------------------
## daynum 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      800        0       31    0.999    15.38    10.06        2        4 
##      .25      .50      .75      .90      .95 
##        8       15       23       27       29 
## 
## lowest :  1  2  3  4  5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster 
##        n  missing distinct     Info     Mean      Gmd 
##      800        0        1        0        1        0 
##               
## Value        1
## Frequency  800
## Proportion   1
## --------------------------------------------------------------------------------
# Profile of cluster 2: subset the clustered data frame and summarise
# every variable with Hmisc::describe().
dfh2 <- dfh %>% filter(cluster == 2)
describe(dfh2)
## dfh2 
## 
##  19  Variables      200  Observations
## --------------------------------------------------------------------------------
## City 
##        n  missing distinct 
##      200        0        3 
##                                         
## Value       Mandalay Naypyitaw    Yangon
## Frequency         65        78        57
## Proportion     0.325     0.390     0.285
## --------------------------------------------------------------------------------
## Customer.type 
##        n  missing distinct 
##      200        0        2 
##                         
## Value      Member Normal
## Frequency     107     93
## Proportion  0.535  0.465
## --------------------------------------------------------------------------------
## Gender 
##        n  missing distinct 
##      200        0        2 
##                         
## Value      Female   Male
## Frequency     105     95
## Proportion  0.525  0.475
## --------------------------------------------------------------------------------
## Product.line 
##        n  missing distinct 
##      200        0        6 
## 
## lowest : Electronic          Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle     
## highest: Fashion_accessories Food&Beverages      Health&Beauty       Home&Lifestyle      Sports&Travel      
##                                                                       
## Value               Electronic Fashion_accessories      Food&Beverages
## Frequency                   38                  30                  23
## Proportion               0.190               0.150               0.115
##                                                                       
## Value            Health&Beauty      Home&Lifestyle       Sports&Travel
## Frequency                   25                  30                  54
## Proportion               0.125               0.150               0.270
## --------------------------------------------------------------------------------
## Unit.price 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0      197        1    77.96    18.42    47.63    55.66 
##      .25      .50      .75      .90      .95 
##    66.61    79.90    90.71    97.53    99.10 
## 
## lowest : 21.43 31.99 37.32 43.13 44.02, highest: 99.55 99.56 99.73 99.82 99.96
## --------------------------------------------------------------------------------
## Quantity 
##        n  missing distinct     Info     Mean      Gmd 
##      200        0        7    0.942    8.435    1.655 
## 
## lowest :  4  5  6  7  8, highest:  6  7  8  9 10
##                                                     
## Value          4     5     6     7     8     9    10
## Frequency      4     3    14    37    32    43    67
## Proportion 0.020 0.015 0.070 0.185 0.160 0.215 0.335
## --------------------------------------------------------------------------------
## Tax.5. 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0      198        1    32.65    9.665    19.02    21.24 
##      .25      .50      .75      .90      .95 
##    26.68    33.66    38.65    44.34    45.25 
## 
## lowest : 10.7150 13.4520 15.9530 15.9950 16.2425
## highest: 48.6900 48.7500 49.2600 49.4900 49.6500
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0      198        1    685.6      203    399.3    446.0 
##      .25      .50      .75      .90      .95 
##    560.2    706.9    811.5    931.2    950.3 
## 
## lowest :  225.0150  282.4920  335.0130  335.8950  341.0925
## highest: 1022.4900 1023.7500 1034.4600 1039.2900 1042.6500
## --------------------------------------------------------------------------------
## datetime 
##                   n             missing            distinct                Info 
##                 200                   0                 198                   1 
##                Mean                 Gmd                 .05                 .10 
## 2019-02-09 19:20:24             2300585 2019-01-06 12:41:51 2019-01-12 09:03:42 
##                 .25                 .50                 .75                 .90 
## 2019-01-21 09:00:15 2019-02-07 12:57:30 2019-03-01 23:42:15 2019-03-14 22:24:18 
##                 .95 
## 2019-03-20 11:46:48 
## 
## lowest : 2019-01-01 10:39:00 2019-01-01 11:36:00 2019-01-01 14:42:00 2019-01-01 14:47:00 2019-01-03 19:08:00
## highest: 2019-03-23 13:23:00 2019-03-24 18:27:00 2019-03-25 18:30:00 2019-03-27 10:43:00 2019-03-29 19:12:00
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct 
##      200        0        3 
##                                               
## Value             Cash Credit card     Ewallet
## Frequency           70          66          64
## Proportion        0.35        0.33        0.32
## --------------------------------------------------------------------------------
## cogs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0      198        1      653    193.3    380.3    424.7 
##      .25      .50      .75      .90      .95 
##    533.6    673.3    772.9    886.8    905.1 
## 
## lowest : 214.30 269.04 319.06 319.90 324.85, highest: 973.80 975.00 985.20 989.80 993.00
## --------------------------------------------------------------------------------
## Rating 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       59        1    6.809    2.023    4.200    4.490 
##      .25      .50      .75      .90      .95 
##    5.300    6.650    8.300    9.310    9.605 
## 
## lowest :  4.0  4.1  4.2  4.3  4.4, highest:  9.6  9.7  9.8  9.9 10.0
## --------------------------------------------------------------------------------
## tmed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       60    0.999    26.28    2.712     22.1     22.9 
##      .25      .50      .75      .90      .95 
##     24.9     26.4     27.7     29.5     30.0 
## 
## lowest : 20.1 20.5 21.2 21.3 21.6, highest: 30.0 30.1 30.5 31.4 34.0
## --------------------------------------------------------------------------------
## day 
##        n  missing distinct     Info     Mean      Gmd 
##      200        0        7    0.978    4.135    2.322 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency     28    21    35    25    29    27    35
## Proportion 0.140 0.105 0.175 0.125 0.145 0.135 0.175
## --------------------------------------------------------------------------------
## month 
##        n  missing distinct     Info     Mean      Gmd 
##      200        0        3    0.877    1.845   0.8747 
##                             
## Value          1     2     3
## Frequency     83    65    52
## Proportion 0.415 0.325 0.260
## --------------------------------------------------------------------------------
## week 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       13    0.992    6.295    3.792     1.00     2.00 
##      .25      .50      .75      .90      .95 
##     3.75     6.00     9.00    11.00    12.00 
## 
## lowest :  1  2  3  4  5, highest:  9 10 11 12 13
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency     11    15    24    19    25    17    18    10    19    15    11
## Proportion 0.055 0.075 0.120 0.095 0.125 0.085 0.090 0.050 0.095 0.075 0.055
##                       
## Value         12    13
## Frequency     13     3
## Proportion 0.065 0.015
## --------------------------------------------------------------------------------
## hour 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       11     0.99    14.53    3.711       10       10 
##      .25      .50      .75      .90      .95 
##       12       14       17       19       20 
## 
## lowest : 10 11 12 13 14, highest: 16 17 18 19 20
##                                                                             
## Value         10    11    12    13    14    15    16    17    18    19    20
## Frequency     25    24    16    20    19    22    13    12    13    23    13
## Proportion 0.125 0.120 0.080 0.100 0.095 0.110 0.065 0.060 0.065 0.115 0.065
## --------------------------------------------------------------------------------
## daynum 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      200        0       31    0.998    14.77    9.937     2.00     3.00 
##      .25      .50      .75      .90      .95 
##     8.00    15.00    22.25    27.00    28.00 
## 
## lowest :  1  2  3  4  5, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## cluster 
##        n  missing distinct     Info     Mean      Gmd 
##      200        0        1        0        2        0 
##               
## Value        2
## Frequency  200
## Proportion   1
## --------------------------------------------------------------------------------

8. Predicción de los grupos del cluster

8.1 Feature Engineering

# Convert to data.table so new columns can be added by reference with `:=`.
supermarket <- as.data.table(supermarket)


# Frequency encoding: for each categorical variable, an fe_* column holds
# the number of rows sharing that category (.N counted per group).
supermarket[ , fe_city := .N , by = .(City)]
supermarket[ , fe_gender := .N , by = .(Gender)]
supermarket[ , fe_customer := .N , by = .(Customer.type)]
supermarket[ , fe_product := .N , by = .(Product.line)]
supermarket[ , fe_payment := .N , by = .(Payment)]


# City coordinates in decimal degrees (Myanmar):
#   Mandalay  lat 21.959433, lon 96.101045
#   Naypyitaw lat 19.740465, lon 96.090555
#   Yangon    lat 16.877067, lon 96.177399
# BUG FIX: the original code stored the latitudes in `longitude` and the
# longitudes in `latitude` (a latitude of 96 degrees is impossible --
# latitude is bounded by +/-90). Values are now assigned to the correct
# column. `fe_lonlat` below is unaffected by this fix because
# sqrt(a^2 + b^2) is symmetric in its two arguments.

supermarket$longitude = ifelse(supermarket$City == "Mandalay", 96.101045, ifelse(supermarket$City == "Naypyitaw", 96.090555, 96.177399))

supermarket$latitude = ifelse(supermarket$City == "Mandalay", 21.959433, ifelse(supermarket$City == "Naypyitaw", 19.740465, 16.877067))

# Combined location feature: Euclidean norm of (longitude, latitude).
supermarket$fe_lonlat  <- sqrt(supermarket$longitude^2 + supermarket$latitude^2)

# One-hot encode the categorical columns, dropping the first level of
# each variable so the dummies are not mutually collinear.
supermarket1 <- fastDummies::dummy_cols(supermarket, remove_first_dummy = TRUE)

# Drop identifier columns and the raw categoricals that are no longer needed.
supermarket1[, 1:4] <- NULL
supermarket1$Payment <- NULL
supermarket1$datetime <- NULL

# Give columns 1, 3 and the dummy columns 21-31 syntactic, model-friendly
# names in a single vectorised assignment (same names, same positions as
# the thirteen one-at-a-time renames this replaces).
names(supermarket1)[c(1, 3, 21:31)] <- c(
  "UnitPrice", "Tax5",
  "CityNaypyitaw", "CityYangon",
  "CustomerTypeNormal", "GenderMale",
  "PLFashionAccessories", "PLFoodBeverages", "PLHealthBeauty",
  "PLHomeLifestyle", "PLSportsTravel",
  "PaymentCreditCard", "PaymentEwallet"
)

# Standardise every column (zero mean, unit variance) and keep the result
# as a plain data frame.
supersca <- as.data.frame(scale(supermarket1))

saveRDS(supersca, file = "supersca.rds")


# Target label: cluster assignment from the PAM model (`dfpam` is built
# in an earlier section of this file).
supersca$cluster <- dfpam$cluster

# Seed so the train/test partition is reproducible.
set.seed(1234)
# 70/30 split, stratified on the cluster label (caret::createDataPartition).
validationIndex <- createDataPartition(supersca$cluster, p=0.70, list=FALSE)

# Hold-out set for validation
my_test1  <- supersca[-validationIndex,]
saveRDS(my_test1, file = "my_test1.rds")

# Training set
my_train1 <- supersca[validationIndex,]
saveRDS(my_train1, file = "my_train1.rds")

# Class balance: overall vs. each split.
prop.table(table(supersca$cluster))
## 
##    1    2 
## 0.67 0.33
prop.table(table(my_test1$cluster))
## 
##         1         2 
## 0.7233333 0.2766667
prop.table(table(my_train1$cluster))
## 
##         1         2 
## 0.6471429 0.3528571

8.2 K-Nearest-Neighbors

# KNN example (K-Nearest-Neighbors) on the scaled train/test splits,
# fetched back from the project repository.

my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))

# Column 32 is the cluster label, the rest are features. The seed fixes
# random tie-breaking inside class::knn() (k = 2 can produce ties).
set.seed(1234)
knnpred <- knn(train = my_train1[,-32], 
                       cl = my_train1[, 32], 
                       test = my_test1[,-32], 
                       k = 2)

# Accuracy: share of test labels predicted correctly.
PredKNN = mean(knnpred == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",PredKNN ))
## [1] "Porcentaje de acierto/Accuracy: 0.77"
# Confusion matrix in long form. NOTE: Pred is the FIRST factor, so the
# rows come out ordered (Pred=1,Obj=1), (Pred=2,Obj=1), (Pred=1,Obj=2),
# (Pred=2,Obj=2); every cfm[i,3] index below relies on exactly this order.
cfm <- as.data.frame(table(Pred = knnpred, Obj = my_test1$cluster))

# Misclassification count: the two off-diagonal cells.
errclasKNN <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasKNN ))
## [1] "Errores de clasificación: 69"
# Confusion-matrix plot (cvms)

plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

# ROCR prediction object from numeric predictions and labels.
pred1 <- prediction(as.numeric(knnpred), as.numeric(my_test1$cluster))

# ROC curve: false positive rate vs. true positive rate -> the closer to
# the top-left corner, the better.
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve -> the closer to 1 the better; at least > 0.7.
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.729415357281661"
# Precision: of the cases classified as group X, which share truly is X?
# HIGH = few false positives.

# Precision group 1 = TP1 / all predicted 1 (cfm rows 1 and 3 are Pred=1).
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.855769230769231"
# Precision group 2 = TP2 / all predicted 2 (rows 4 and 2 are Pred=2... row 2 is Pred=2,Obj=1).
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.576086956521739"
# Recall: of the cases that truly are group X, which share is classified
# as X? HIGH = few false negatives.

# Recall group 1 = TP1 / all truly 1 (rows 1 and 2 are Obj=1).
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.820276497695853"
# Recall group 2 = TP2 / all truly 2 (rows 3 and 4 are Obj=2).
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.63855421686747"
# F1 score group 1: harmonic mean of precision and recall.
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.83764705882353"
# F1 score group 2 (the original comment said "F2" -- it is F1).
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.605714285714286"
# Start the running results table; later model sections rbind() onto it.
RESULTADOS <-data.frame(modelo = ('knn'),
                        accuracy =c(PredKNN),
                        AUC =c(auc),
                        precision1 =c(prec1),
                        precision2 =c(prec2),
                        recall1 =c(rec1),
                        recall2 =c(rec2),
                        f1grupo1 =c(f1gr1),
                        f1grupo2 =c(f1gr2),
                        errorClas =c(errclasKNN)
                        )

8.3 Support Vector Machines

# Support Vector Machine (RBF kernel) fitted with caret on the same
# saved train/test splits.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))

# caret treats the task as classification only when the outcome is a factor.
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)

# Seed for reproducible resampling during hyper-parameter tuning.
set.seed(1234)
modeloSVM = train(form = cluster ~ ., data = my_train1, method = 'svmRadial')
modeloSVM$bestTune
##        sigma C
## 3 0.01849074 1
# Predict on the hold-out set (column 32 is the label).
pred_valid_SVM = predict(modeloSVM, newdata = my_test1[,-32])


# Confusion matrix in long form. NOTE: here Obj is the FIRST factor
# (the KNN section used Pred first), so the rows are ordered
# (Obj=1,Pred=1), (Obj=2,Pred=1), (Obj=1,Pred=2), (Obj=2,Pred=2):
# cfm[2,3] = predicted 1 but truly 2; cfm[3,3] = predicted 2 but truly 1.
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_SVM))
cfm
##   Obj Pred Freq
## 1   1    1  208
## 2   2    1    8
## 3   1    2    9
## 4   2    2   75
# Misclassification count: the two off-diagonal cells.
errclasSVM <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasSVM ))
## [1] "Errores de clasificación: 17"
# Confusion-matrix plot (cvms)
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

# Accuracy on the hold-out set.
predSVM = mean(pred_valid_SVM == my_test1$cluster)
predSVM
## [1] 0.9433333
print(paste0("Porcentaje de acierto/Accuracy: ",predSVM ))
## [1] "Porcentaje de acierto/Accuracy: 0.943333333333333"
pred1 <- prediction(as.numeric(pred_valid_SVM), as.numeric(my_test1$cluster))

# ROC curve: false positive rate vs. true positive rate -> the closer to
# the top-left corner, the better.
ROC.perf <- performance(pred1, "tpr", "fpr")
plot(ROC.perf)

# Area under the ROC curve -> the closer to 1 the better; at least > 0.7.
auc.tmp <- performance(pred1,"auc")
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.931069901726723"
# BUG FIX: because Obj (not Pred) is the first factor in this table, the
# original formulas divided by actual-class totals when printing
# "Precision" and by predicted-class totals when printing "Recall" --
# i.e. the two metrics were swapped. Corrected below:
#   precision_k = TP_k / all predicted as k  (high = few false positives)
#   recall_k    = TP_k / all truly k         (high = few false negatives)
# Accuracy, AUC and F1 are unaffected by the swap.

# Precision group 1
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.962962962962963"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.892857142857143"

# Recall group 1
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.95852534562212"
# Recall group 2
rec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.903614457831325"
# F1 score group 1 (harmonic mean; identical under the swap fix).
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.960739030023095"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.898203592814371"
# Append this model's metrics to the running results table.
z <-data.frame(modelo = ('svm'),
                        accuracy =c(predSVM),
                        AUC =c(auc),
                        precision1 =c(prec1),
                        precision2 =c(prec2),
                        recall1 =c(rec1),
                        recall2 =c(rec2),
                        f1grupo1 =c(f1gr1),
                        f1grupo2 =c(f1gr2),
                        errorClas =c(errclasSVM)
                        )
RESULTADOS <-rbind(RESULTADOS,z)

8.4 Generalized Linear Model

Regresión logística (GLM binomial)

# Logistic regression (binomial GLM) on the same saved train/test splits.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))

# NOTE: the redundant library(stats) call was removed -- stats is
# attached by default in every R session.

# Drop columns 4 and 5, flagged as correlated with other predictors by
# the original author -- TODO(review): confirm which two columns these
# are in the current layout.
modeloGLM <- glm(as.factor(cluster) ~ ., family = binomial, data = my_train1[,-4:-5])
summary(modeloGLM) 
## 
## Call:
## glm(formula = as.factor(cluster) ~ ., family = binomial, data = my_train1[, 
##     -4:-5])
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.842e-04  -2.100e-08  -2.100e-08   2.100e-08   2.849e-04  
## 
## Coefficients: (8 not defined because of singularities)
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)            -183.55   12864.02  -0.014    0.989
## UnitPrice                82.84   11570.41   0.007    0.994
## Quantity                 30.90   14973.49   0.002    0.998
## Tax5                     96.05   14920.70   0.006    0.995
## Rating                  -55.64   17530.29  -0.003    0.997
## tmed                     93.73    9478.79   0.010    0.992
## day                    -110.15   12054.81  -0.009    0.993
## month                   -11.30  111681.39   0.000    1.000
## week                     74.52  108682.13   0.001    0.999
## hour                    -19.80   17469.90  -0.001    0.999
## daynum                  151.38   40764.04   0.004    0.997
## fe_city                  86.49    9078.32   0.010    0.992
## fe_gender               145.95   10975.73   0.013    0.989
## fe_customer             141.36    9671.00   0.015    0.988
## fe_product              -51.63   48966.42  -0.001    0.999
## fe_payment             4935.14  327392.85   0.015    0.988
## longitude               -83.47   10544.28  -0.008    0.994
## latitude                    NA         NA      NA       NA
## fe_lonlat                   NA         NA      NA       NA
## CityNaypyitaw               NA         NA      NA       NA
## CityYangon                  NA         NA      NA       NA
## CustomerTypeNormal          NA         NA      NA       NA
## GenderMale                  NA         NA      NA       NA
## PLFashionAccessories     22.33   29788.73   0.001    0.999
## PLFoodBeverages          12.84   15948.70   0.001    0.999
## PLHealthBeauty          -41.16   34227.62  -0.001    0.999
## PLHomeLifestyle         -24.59   28149.10  -0.001    0.999
## PLSportsTravel              NA         NA      NA       NA
## PaymentCreditCard      4865.16  321386.27   0.015    0.988
## PaymentEwallet              NA         NA      NA       NA
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9.0888e+02  on 699  degrees of freedom
## Residual deviance: 6.0020e-07  on 678  degrees of freedom
## AIC: 44
## 
## Number of Fisher Scoring iterations: 25
# The last column of the coefficient table is the p-value: the
# statistical relevance of each predictor. The near-zero residual
# deviance shows the classes are (almost) perfectly separable, which is
# what inflates the standard errors above.

# Predicted probability of the second class; threshold at 0.5, then map
# the 0/1 codes back to the cluster labels "1"/"2".
pred_valid_GLM <- predict(modeloGLM, type = 'response', newdata = my_test1[,-4:-5])
pred_valid_GLM <- ifelse(pred_valid_GLM > 0.5, 1, 0)
pred_valid_GLM <- factor(pred_valid_GLM, levels = c("0", "1"), labels = c("1", "2"))


# Confusion matrix in long form; Obj is the FIRST factor, so rows are
# (Obj=1,Pred=1), (Obj=2,Pred=1), (Obj=1,Pred=2), (Obj=2,Pred=2).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_GLM))
cfm
##   Obj Pred Freq
## 1   1    1  212
## 2   2    1    1
## 3   1    2    5
## 4   2    2   82
# Misclassification count: the two off-diagonal cells.
errclasGLM <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasGLM ))
## [1] "Errores de clasificación: 6"
# Confusion-matrix plot (cvms)
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

# Accuracy on the hold-out set.
predGLM = mean(pred_valid_GLM == my_test1$cluster)

print(paste0("Porcentaje de acierto/Accuracy: ",predGLM ))
## [1] "Porcentaje de acierto/Accuracy: 0.98"
pred1 <- prediction(as.numeric(pred_valid_GLM), as.numeric(my_test1$cluster))

# ROC curve: false positive rate vs. true positive rate -> the closer to
# the top-left corner, the better.
ROC.perf <- performance(pred1, "tpr", "fpr")
plot(ROC.perf)

# Area under the ROC curve -> the closer to 1 the better; at least > 0.7.
auc.tmp <- performance(pred1,"auc")
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.982455166287269"
# BUG FIX: as in the SVM section, the original index formulas swapped
# precision and recall (actual-class totals were used for "Precision"
# and predicted-class totals for "Recall"). Corrected below:
#   precision_k = TP_k / all predicted as k  (high = few false positives)
#   recall_k    = TP_k / all truly k         (high = few false negatives)
# Accuracy, AUC and F1 are unaffected by the swap.

# Precision group 1
prec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.995305164319249"
# Precision group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.942528735632184"

# Recall group 1
rec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.976958525345622"
# Recall group 2
rec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.987951807228916"
# F1 score group 1 (harmonic mean; identical under the swap fix).
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.986046511627907"
# F1 score group 2
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.964705882352941"
z1 <-data.frame(modelo = ('glm'),
                        accuracy =c(predGLM),
                        AUC =c(auc),
                        precision1 =c(prec1),
                        precision2 =c(prec2),
                        recall1 =c(rec1),
                        recall2 =c(rec2),
                        f1grupo1 =c(f1gr1),
                        f1grupo2 =c(f1gr2),
                        errorClas =c(errclasGLM)
                        )
RESULTADOS <-rbind(RESULTADOS,z1)

8.5 Naive Bayes

# Reload the train/test partitions so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))

# naiveBayes() needs a factor response for classification.
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)


set.seed(1234)
modeloBayes <- naiveBayes(cluster ~ ., data = my_train1)

pred_valid_BAYES <- predict(modeloBayes, newdata = my_test1)

# Long-format confusion counts; rows are (Obj, Pred) = (1,1), (2,1), (1,2), (2,2).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_BAYES))
cfm
##   Obj Pred Freq
## 1   1    1  190
## 2   2    1   18
## 3   1    2   27
## 4   2    2   65
# Misclassifications = sum of the off-diagonal counts
errclasBAYES <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasBAYES ))
## [1] "Errores de clasificación: 45"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

# ACCURACY
predBAYES <- mean(pred_valid_BAYES == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predBAYES ))
## [1] "Porcentaje de acierto/Accuracy: 0.85"
pred1 <- prediction(as.numeric(pred_valid_BAYES), as.numeric(my_test1$cluster))

# ROC CURVE 'False positive rate' vs. 'True positive rate' -> THE CLOSER TO THE TOP-LEFT, THE BETTER
ROC.perf <- performance(pred1, "tpr", "fpr")
plot(ROC.perf)

# AREA UNDER THE ROC CURVE -> BETTER THE CLOSER TO 1, AND AT LEAST ABOVE 0.7
auc.tmp <- performance(pred1,"auc")
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.829354283493421"
# What proportion of observations classified as group X truly belong to it?
# HIGH = FEW FALSE POSITIVES
# NOTE(review): the original code had the precision and recall formulas
# swapped; corrected below (F1 is symmetric, so it was unaffected).

# Precision group 1: TP1 / (TP1 + FP1), FP1 = group 2 predicted as 1 (row 2)
prec1 <- cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.913461538461538"
# Precision group 2: TP2 / (TP2 + FP2), FP2 = group 1 predicted as 2 (row 3)
prec2 <- cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.706521739130435"
# What proportion of the members of group X are classified as such?
# HIGH = FEW FALSE NEGATIVES

# RECALL GROUP 1: TP1 / (TP1 + FN1), FN1 = group 1 predicted as 2 (row 3)
rec1 <- cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.875576036866359"
# RECALL GROUP 2: TP2 / (TP2 + FN2), FN2 = group 2 predicted as 1 (row 2)
rec2 <- cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.783132530120482"
# F1 SCORE GROUP 1 (harmonic mean of precision and recall)
f1gr1 <- 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.894117647058824"
# F1 SCORE GROUP 2
f1gr2 <- 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.742857142857143"
# Append the Naive Bayes metrics to the running model-comparison table
z2 <- data.frame(modelo = 'NaiveBayes',
                 accuracy = predBAYES,
                 AUC = auc,
                 precision1 = prec1,
                 precision2 = prec2,
                 recall1 = rec1,
                 recall2 = rec2,
                 f1grupo1 = f1gr1,
                 f1grupo2 = f1gr2,
                 errorClas = errclasBAYES)
RESULTADOS <- rbind(RESULTADOS, z2)

8.6 Arbol de decisión RPART

# Reload the train/test partitions so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))


# rpart classification needs a factor response.
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)

# Tune minsplit via 10-fold cross-validation; the best value found is 5.
obj3 <- tune.rpart(cluster~., data = my_train1, minsplit = c(5,10,15))
summary(obj3)
## 
## Parameter tuning of 'rpart.wrapper':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  minsplit
##         5
## 
## - best performance: 0.22 
## 
## - Detailed performance results:
##   minsplit     error dispersion
## 1        5 0.2200000 0.06324555
## 2       10 0.2200000 0.06324555
## 3       15 0.2228571 0.06179477
plot(obj3)

set.seed(1234)
modeloDT <- rpart(cluster ~ ., data = my_train1, minsplit = 5)

pred_valid_DT <- predict(modeloDT, newdata = my_test1, type = 'class')

# Long-format confusion counts; rows are (Obj, Pred) = (1,1), (2,1), (1,2), (2,2).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_DT))
cfm
##   Obj Pred Freq
## 1   1    1  184
## 2   2    1   28
## 3   1    2   33
## 4   2    2   55
# Misclassifications = sum of the off-diagonal counts
errclasDT <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasDT ))
## [1] "Errores de clasificación: 61"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

# Accuracy
predRPART <- mean(pred_valid_DT == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predRPART )) 
## [1] "Porcentaje de acierto/Accuracy: 0.796666666666667"
pred1 <- prediction(as.numeric(pred_valid_DT), as.numeric(my_test1$cluster))

# ROC CURVE 'False positive rate' vs. 'True positive rate' -> THE CLOSER TO THE TOP-LEFT, THE BETTER
ROC.perf <- performance(pred1, "tpr", "fpr")
plot(ROC.perf)

# AREA UNDER THE ROC CURVE -> BETTER THE CLOSER TO 1, AND AT LEAST ABOVE 0.7
auc.tmp <- performance(pred1,"auc")
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.755288434845372"
# What proportion of observations classified as group X truly belong to it?
# HIGH = FEW FALSE POSITIVES
# NOTE(review): the original code had the precision and recall formulas
# swapped; corrected below (F1 is symmetric, so it was unaffected).

# Precision group 1: TP1 / (TP1 + FP1), FP1 = group 2 predicted as 1 (row 2)
prec1 <- cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.867924528301887"
# Precision group 2: TP2 / (TP2 + FP2), FP2 = group 1 predicted as 2 (row 3)
prec2 <- cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.625"
# What proportion of the members of group X are classified as such?
# HIGH = FEW FALSE NEGATIVES

# RECALL GROUP 1: TP1 / (TP1 + FN1), FN1 = group 1 predicted as 2 (row 3)
rec1 <- cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.847926267281106"
# RECALL GROUP 2: TP2 / (TP2 + FN2), FN2 = group 2 predicted as 1 (row 2)
rec2 <- cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.662650602409639"
# F1 SCORE GROUP 1 (harmonic mean of precision and recall)
f1gr1 <- 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.857808857808858"
# F1 SCORE GROUP 2
f1gr2 <- 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.64327485380117"
# Append the rpart metrics to the running model-comparison table
z3 <- data.frame(modelo = 'rpart',
                 accuracy = predRPART,
                 AUC = auc,
                 precision1 = prec1,
                 precision2 = prec2,
                 recall1 = rec1,
                 recall2 = rec2,
                 f1grupo1 = f1gr1,
                 f1grupo2 = f1gr2,
                 errorClas = errclasDT)
RESULTADOS <- rbind(RESULTADOS, z3)

8.7 Linear Discriminant Analysis (LDA)

# Reload the train/test partitions so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))


# Linear Discriminant Analysis (MASS::lda). Stored as modeloLDA to avoid
# shadowing the lda() function itself.
modeloLDA <- lda(cluster ~ ., data = my_train1)
coef(modeloLDA)
##                              LD1
## UnitPrice             0.17194038
## Quantity              0.01959883
## Tax5                  0.17275307
## Total                 0.17275307
## cogs                  0.17275307
## Rating               -0.21915703
## tmed                  0.31981099
## day                  -0.34177656
## month                 0.34371771
## week                 -0.14997366
## hour                 -0.01282634
## daynum                0.70094224
## fe_city               0.11130508
## fe_gender             0.27364556
## fe_customer           0.24515802
## fe_product            0.02305005
## fe_payment            0.00737049
## longitude            -0.09420155
## latitude              0.11462921
## fe_lonlat            -0.08722340
## CityNaypyitaw        -0.06804700
## CityYangon            0.11389982
## CustomerTypeNormal   -0.24515802
## GenderMale           -0.27364556
## PLFashionAccessories -0.09121014
## PLFoodBeverages       0.03124522
## PLHealthBeauty       -0.06811002
## PLHomeLifestyle       0.01224416
## PLSportsTravel       -0.05500083
## PaymentCreditCard     0.01005304
## PaymentEwallet        0.58871422
pred <- predict(modeloLDA, my_test1)

# Predicted class as numeric 1/2 to compare against the numeric cluster column.
pred_valid_LDA <- as.numeric(pred$class)

# Long-format confusion counts; rows are (Obj, Pred) = (1,1), (2,1), (1,2), (2,2).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_LDA))
cfm
##   Obj Pred Freq
## 1   1    1  210
## 2   2    1    4
## 3   1    2    7
## 4   2    2   79
# Misclassifications = sum of the off-diagonal counts
errclasLDA <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasLDA )) 
## [1] "Errores de clasificación: 11"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

# Accuracy
predLDA <- mean(pred_valid_LDA == my_test1$cluster)
predLDA
## [1] 0.9633333
print(paste0("Porcentaje de acierto/Accuracy: ",predLDA ))
## [1] "Porcentaje de acierto/Accuracy: 0.963333333333333"
pred1 <- prediction(as.numeric(pred_valid_LDA), as.numeric(my_test1$cluster))

# ROC CURVE 'False positive rate' vs. 'True positive rate' -> THE CLOSER TO THE TOP-LEFT, THE BETTER
ROC.perf <- performance(pred1, "tpr", "fpr")
plot(ROC.perf)

# AREA UNDER THE ROC CURVE -> BETTER THE CLOSER TO 1, AND AT LEAST ABOVE 0.7
auc.tmp <- performance(pred1,"auc")
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.959774582199767"
# What proportion of observations classified as group X truly belong to it?
# HIGH = FEW FALSE POSITIVES
# NOTE(review): the original code had the precision and recall formulas
# swapped; corrected below (F1 is symmetric, so it was unaffected).

# Precision group 1: TP1 / (TP1 + FP1), FP1 = group 2 predicted as 1 (row 2)
prec1 <- cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.981308411214953"
# Precision group 2: TP2 / (TP2 + FP2), FP2 = group 1 predicted as 2 (row 3)
prec2 <- cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.918604651162791"
# What proportion of the members of group X are classified as such?
# HIGH = FEW FALSE NEGATIVES

# RECALL GROUP 1: TP1 / (TP1 + FN1), FN1 = group 1 predicted as 2 (row 3)
rec1 <- cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.967741935483871"
# RECALL GROUP 2: TP2 / (TP2 + FN2), FN2 = group 2 predicted as 1 (row 2)
rec2 <- cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.951807228915663"
# F1 SCORE GROUP 1 (harmonic mean of precision and recall)
f1gr1 <- 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.974477958236659"
# F1 SCORE GROUP 2
f1gr2 <- 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.93491124260355"
# Append the LDA metrics to the running model-comparison table
z4 <- data.frame(modelo = 'lda',
                 accuracy = predLDA,
                 AUC = auc,
                 precision1 = prec1,
                 precision2 = prec2,
                 recall1 = rec1,
                 recall2 = rec2,
                 f1grupo1 = f1gr1,
                 f1grupo2 = f1gr2,
                 errorClas = errclasLDA)
RESULTADOS <- rbind(RESULTADOS, z4)

8.8 Adabag Boosting

# Reload the train/test partitions so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))


# boosting() needs a factor response for classification.
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)

library(adabag)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
# AdaBoost.M1 with bootstrap resampling (boos=TRUE) and 50 trees.
model <- boosting(cluster~., data=my_train1, boos=TRUE, mfinal=50)
print(names(model))
## [1] "formula"    "trees"      "weights"    "votes"      "prob"      
## [6] "class"      "importance" "terms"      "call"
# Relative importance of each predictor across the boosted trees
model$importance
##        CityNaypyitaw           CityYangon                 cogs 
##           0.00000000           0.00000000           0.00000000 
##   CustomerTypeNormal                  day               daynum 
##           0.74919941           5.57559368          13.03371994 
##              fe_city          fe_customer            fe_gender 
##           5.65445964           5.99890496           5.76982886 
##            fe_lonlat           fe_payment           fe_product 
##           0.00000000           8.48458151           1.83541540 
##           GenderMale                 hour             latitude 
##           1.00698081           3.49894062           0.00000000 
##            longitude                month    PaymentCreditCard 
##           1.35292374           0.06094276           0.06981940 
##       PaymentEwallet PLFashionAccessories      PLFoodBeverages 
##           0.00000000           0.00000000           0.86212968 
##       PLHealthBeauty      PLHomeLifestyle       PLSportsTravel 
##           0.00000000           0.45846986           0.32577381 
##             Quantity               Rating                 Tax5 
##           2.85153048           7.97665258          12.63500103 
##                 tmed                Total            UnitPrice 
##          10.51587883           0.00000000           8.02500179 
##                 week 
##           3.25825118
pred <- predict(model, my_test1)
print(names(pred))
## [1] "formula"   "votes"     "prob"      "class"     "confusion" "error"
print(pred$confusion)
##                Observed Class
## Predicted Class   1   2
##               1 202  14
##               2  15  69
print(pred$error)
## [1] 0.09666667
pred_valid_ADABAG <- pred$class

# Long-format confusion counts; rows are (Obj, Pred) = (1,1), (2,1), (1,2), (2,2).
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_ADABAG))
cfm
##   Obj Pred Freq
## 1   1    1  202
## 2   2    1   14
## 3   1    2   15
## 4   2    2   69
# Misclassifications = sum of the off-diagonal counts
errclasADABAG <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasADABAG ))
## [1] "Errores de clasificación: 29"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

# Accuracy
predADABAG <- mean(pred_valid_ADABAG == my_test1$cluster)
predADABAG
## [1] 0.9033333
print(paste0("Porcentaje de acierto/Accuracy: ",predADABAG )) 
## [1] "Porcentaje de acierto/Accuracy: 0.903333333333333"
pred1 <- prediction(as.numeric(pred_valid_ADABAG), as.numeric(my_test1$cluster))

# ROC CURVE 'False positive rate' vs. 'True positive rate' -> THE CLOSER TO THE TOP-LEFT, THE BETTER
ROC.perf <- performance(pred1, "tpr", "fpr")
plot(ROC.perf)

# AREA UNDER THE ROC CURVE -> BETTER THE CLOSER TO 1, AND AT LEAST ABOVE 0.7
auc.tmp <- performance(pred1,"auc")
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.881100438620843"
# What proportion of observations classified as group X truly belong to it?
# HIGH = FEW FALSE POSITIVES
# NOTE(review): the original code had the precision and recall formulas
# swapped; corrected below (F1 is symmetric, so it was unaffected).

# Precision group 1: TP1 / (TP1 + FP1), FP1 = group 2 predicted as 1 (row 2)
prec1 <- cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.935185185185185"
# Precision group 2: TP2 / (TP2 + FP2), FP2 = group 1 predicted as 2 (row 3)
prec2 <- cfm[4,3]/(cfm[4,3]+cfm[3,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.821428571428571"
# What proportion of the members of group X are classified as such?
# HIGH = FEW FALSE NEGATIVES

# RECALL GROUP 1: TP1 / (TP1 + FN1), FN1 = group 1 predicted as 2 (row 3)
rec1 <- cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.930875576036866"
# RECALL GROUP 2: TP2 / (TP2 + FN2), FN2 = group 2 predicted as 1 (row 2)
rec2 <- cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.831325301204819"
# F1 SCORE GROUP 1 (harmonic mean of precision and recall)
f1gr1 <- 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.933025404157044"
# F1 SCORE GROUP 2
f1gr2 <- 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.826347305389222"
# Append the adabag metrics to the running model-comparison table
z5 <- data.frame(modelo = 'adabag',
                 accuracy = predADABAG,
                 AUC = auc,
                 precision1 = prec1,
                 precision2 = prec2,
                 recall1 = rec1,
                 recall2 = rec2,
                 f1grupo1 = f1gr1,
                 f1grupo2 = f1gr2,
                 errorClas = errclasADABAG)
RESULTADOS <- rbind(RESULTADOS, z5)

# Per-observation probability of each class alongside the predicted class
result <- data.frame(my_test1$cluster, pred$prob, pred$class)
print(result)
##     my_test1.cluster         X1         X2 pred.class
## 1                  1 0.98044479 0.01955521          1
## 2                  1 0.65719870 0.34280130          1
## 3                  1 0.84465085 0.15534915          1
## 4                  1 0.88509134 0.11490866          1
## 5                  1 0.82103833 0.17896167          1
## 6                  1 0.67525775 0.32474225          1
## 7                  1 0.87795282 0.12204718          1
## 8                  1 0.69595980 0.30404020          1
## 9                  1 0.87067310 0.12932690          1
## 10                 1 0.37756652 0.62243348          2
## 11                 1 0.72783974 0.27216026          1
## 12                 1 0.85719189 0.14280811          1
## 13                 1 0.84544412 0.15455588          1
## 14                 1 0.74537205 0.25462795          1
## 15                 1 0.79382014 0.20617986          1
## 16                 2 0.63579342 0.36420658          1
## 17                 1 0.73538188 0.26461812          1
## 18                 1 0.85851076 0.14148924          1
## 19                 1 0.83021034 0.16978966          1
## 20                 1 0.94456789 0.05543211          1
## 21                 1 0.66026878 0.33973122          1
## 22                 2 0.67174560 0.32825440          1
## 23                 1 1.00000000 0.00000000          1
## 24                 1 0.74533376 0.25466624          1
## 25                 1 0.76903742 0.23096258          1
## 26                 1 0.78237583 0.21762417          1
## 27                 1 0.77195257 0.22804743          1
## 28                 1 0.53978897 0.46021103          1
## 29                 1 0.89159380 0.10840620          1
## 30                 1 0.86567834 0.13432166          1
## 31                 1 0.82477625 0.17522375          1
## 32                 1 0.60470923 0.39529077          1
## 33                 1 0.82043569 0.17956431          1
## 34                 1 0.87925919 0.12074081          1
## 35                 1 0.57908722 0.42091278          1
## 36                 1 0.84304697 0.15695303          1
## 37                 1 0.90036947 0.09963053          1
## 38                 1 0.68300615 0.31699385          1
## 39                 2 0.45580125 0.54419875          2
## 40                 1 0.90522610 0.09477390          1
## 41                 1 0.76918380 0.23081620          1
## 42                 1 0.75172238 0.24827762          1
## 43                 1 0.79917020 0.20082980          1
## 44                 1 0.88517266 0.11482734          1
## 45                 2 0.17418392 0.82581608          2
## 46                 1 0.84489194 0.15510806          1
## 47                 1 0.55950838 0.44049162          1
## 48                 1 0.95765517 0.04234483          1
## 49                 2 0.33289662 0.66710338          2
## 50                 2 0.29193571 0.70806429          2
## 51                 1 0.74044540 0.25955460          1
## 52                 1 0.42447065 0.57552935          2
## 53                 1 0.87850362 0.12149638          1
## 54                 1 0.86321446 0.13678554          1
## 55                 1 0.44030269 0.55969731          2
## 56                 2 0.46875616 0.53124384          2
## 57                 1 0.94066313 0.05933687          1
## 58                 1 0.86387990 0.13612010          1
## 59                 1 0.83393174 0.16606826          1
## 60                 1 0.88280893 0.11719107          1
## 61                 1 0.93752696 0.06247304          1
## 62                 1 0.89304743 0.10695257          1
## 63                 2 0.29810034 0.70189966          2
## 64                 1 0.85679078 0.14320922          1
## 65                 2 0.49296422 0.50703578          2
## 66                 1 0.76090245 0.23909755          1
## 67                 2 0.58718727 0.41281273          1
## 68                 1 0.81579296 0.18420704          1
## 69                 1 0.88149561 0.11850439          1
## 70                 1 0.78109662 0.21890338          1
## 71                 1 0.87559865 0.12440135          1
## 72                 1 0.83534798 0.16465202          1
## 73                 1 0.70094347 0.29905653          1
## 74                 1 0.93845853 0.06154147          1
## 75                 2 0.39393461 0.60606539          2
## 76                 1 0.77393195 0.22606805          1
## 77                 2 0.56496819 0.43503181          1
## 78                 1 0.64280600 0.35719400          1
## 79                 1 0.86487183 0.13512817          1
## 80                 1 0.82146535 0.17853465          1
## 81                 2 0.23398233 0.76601767          2
## 82                 1 0.70897289 0.29102711          1
## 83                 1 0.90426470 0.09573530          1
## 84                 1 0.69009484 0.30990516          1
## 85                 2 0.42362262 0.57637738          2
## 86                 1 0.84429444 0.15570556          1
## 87                 1 0.72654995 0.27345005          1
## 88                 1 0.71636171 0.28363829          1
## 89                 1 0.81752170 0.18247830          1
## 90                 2 0.34730122 0.65269878          2
## 91                 2 0.26576042 0.73423958          2
## 92                 2 0.31485304 0.68514696          2
## 93                 2 0.28321468 0.71678532          2
## 94                 1 0.47486420 0.52513580          2
## 95                 1 0.77925697 0.22074303          1
## 96                 1 0.44735913 0.55264087          2
## 97                 1 0.92002788 0.07997212          1
## 98                 1 0.51885586 0.48114414          1
## 99                 2 0.38727675 0.61272325          2
## 100                1 0.47038133 0.52961867          2
## 101                1 0.75834099 0.24165901          1
## 102                1 0.82361396 0.17638604          1
## 103                1 0.52383583 0.47616417          1
## 104                1 0.71247243 0.28752757          1
## 105                2 0.27633649 0.72366351          2
## 106                2 0.31529910 0.68470090          2
## 107                1 0.86308501 0.13691499          1
## 108                2 0.34363523 0.65636477          2
## 109                1 0.63188820 0.36811180          1
## 110                2 0.21805036 0.78194964          2
## 111                1 0.86935189 0.13064811          1
## 112                1 0.57236525 0.42763475          1
## 113                1 0.54188346 0.45811654          1
## 114                1 0.74872076 0.25127924          1
## 115                1 0.93718889 0.06281111          1
## 116                1 0.90205014 0.09794986          1
## 117                1 0.82279648 0.17720352          1
## 118                1 0.71355729 0.28644271          1
## 119                1 0.89257065 0.10742935          1
## 120                2 0.29803792 0.70196208          2
## 121                2 0.49948245 0.50051755          2
## 122                1 0.86411343 0.13588657          1
## 123                1 0.92427702 0.07572298          1
## 124                1 0.68940179 0.31059821          1
## 125                1 0.78588045 0.21411955          1
## 126                1 0.70703190 0.29296810          1
## 127                1 0.78272970 0.21727030          1
## 128                1 0.83921883 0.16078117          1
## 129                1 0.62152915 0.37847085          1
## 130                2 0.50944058 0.49055942          1
## 131                1 0.89182988 0.10817012          1
## 132                1 0.94343071 0.05656929          1
## 133                1 0.78643067 0.21356933          1
## 134                1 0.90624065 0.09375935          1
## 135                1 0.84101654 0.15898346          1
## 136                1 0.96519064 0.03480936          1
## 137                1 0.97917164 0.02082836          1
## 138                1 0.90456314 0.09543686          1
## 139                1 0.87054760 0.12945240          1
## 140                1 0.68068381 0.31931619          1
## 141                1 0.98303948 0.01696052          1
## 142                1 0.84364465 0.15635535          1
## 143                1 0.94026440 0.05973560          1
## 144                1 0.70068669 0.29931331          1
## 145                1 0.87501129 0.12498871          1
## 146                1 0.76365204 0.23634796          1
## 147                1 0.27369253 0.72630747          2
## 148                1 0.50973397 0.49026603          1
## 149                1 0.88042793 0.11957207          1
## 150                2 0.60788478 0.39211522          1
## 151                1 0.79657147 0.20342853          1
## 152                1 0.86042181 0.13957819          1
## 153                2 0.17561941 0.82438059          2
## 154                1 0.74364689 0.25635311          1
## 155                1 0.83112958 0.16887042          1
## 156                1 0.87796524 0.12203476          1
## 157                1 0.68095381 0.31904619          1
## 158                1 0.74841007 0.25158993          1
## 159                1 0.53205151 0.46794849          1
## 160                1 0.80065240 0.19934760          1
## 161                1 0.74748940 0.25251060          1
## 162                1 0.71490212 0.28509788          1
## 163                2 0.40706453 0.59293547          2
## 164                1 0.66357758 0.33642242          1
## 165                1 0.84069592 0.15930408          1
## 166                1 0.90614964 0.09385036          1
## 167                1 0.88653756 0.11346244          1
## 168                1 0.80676697 0.19323303          1
## 169                1 0.78301156 0.21698844          1
## 170                2 0.13202779 0.86797221          2
## 171                2 0.29566826 0.70433174          2
## 172                1 0.41959602 0.58040398          2
## 173                2 0.31651790 0.68348210          2
## 174                2 0.48356159 0.51643841          2
## 175                1 0.94401337 0.05598663          1
## 176                2 0.45602587 0.54397413          2
## 177                1 0.46200831 0.53799169          2
## 178                1 0.39196416 0.60803584          2
## 179                2 0.49726587 0.50273413          2
## 180                1 0.64529322 0.35470678          1
## 181                2 0.24682300 0.75317700          2
## 182                1 0.72048090 0.27951910          1
## 183                1 0.73982570 0.26017430          1
## 184                1 0.72578471 0.27421529          1
## 185                1 0.83967392 0.16032608          1
## 186                2 0.09727077 0.90272923          2
## 187                2 0.23006488 0.76993512          2
## 188                2 0.15427183 0.84572817          2
## 189                2 0.01696052 0.98303948          2
## 190                1 0.72165895 0.27834105          1
## 191                2 0.17419515 0.82580485          2
## 192                2 0.17786808 0.82213192          2
## 193                1 0.51028818 0.48971182          1
## 194                2 0.37741936 0.62258064          2
## 195                1 0.53651474 0.46348526          1
## 196                1 0.47165875 0.52834125          2
## 197                1 0.70490064 0.29509936          1
## 198                1 0.50949007 0.49050993          1
## 199                1 0.76133652 0.23866348          1
## 200                1 0.71761376 0.28238624          1
## 201                1 0.97799229 0.02200771          1
## 202                1 0.64610033 0.35389967          1
## 203                1 0.81390759 0.18609241          1
## 204                1 0.67015518 0.32984482          1
## 205                1 0.92418157 0.07581843          1
## 206                1 0.80742336 0.19257664          1
## 207                1 0.77262900 0.22737100          1
## 208                1 0.90194271 0.09805729          1
## 209                1 0.96405239 0.03594761          1
## 210                1 0.65732347 0.34267653          1
## 211                2 0.54113726 0.45886274          1
## 212                1 0.88222737 0.11777263          1
## 213                1 0.70548357 0.29451643          1
## 214                2 0.40498594 0.59501406          2
## 215                1 0.84026703 0.15973297          1
## 216                2 0.09364869 0.90635131          2
## 217                1 0.84149769 0.15850231          1
## 218                1 0.92126921 0.07873079          1
## 219                2 0.54061876 0.45938124          1
## 220                1 0.71925762 0.28074238          1
## 221                1 0.91641207 0.08358793          1
## 222                1 0.86912957 0.13087043          1
## 223                1 0.90161560 0.09838440          1
## 224                1 0.97852177 0.02147823          1
## 225                1 0.70785463 0.29214537          1
## 226                1 0.70660916 0.29339084          1
## 227                1 0.90546969 0.09453031          1
## 228                1 0.94158507 0.05841493          1
## 229                1 0.83590433 0.16409567          1
## 230                2 0.39777233 0.60222767          2
## 231                1 0.88805921 0.11194079          1
## 232                1 1.00000000 0.00000000          1
## 233                2 0.38615503 0.61384497          2
## 234                2 0.43601802 0.56398198          2
## 235                1 0.72189106 0.27810894          1
## 236                2 0.64811031 0.35188969          1
## 237                2 0.40748631 0.59251369          2
## 238                2 0.57294670 0.42705330          1
## 239                1 0.84490652 0.15509348          1
## 240                1 0.49678086 0.50321914          2
## 241                1 0.74871891 0.25128109          1
## 242                1 0.72976362 0.27023638          1
## 243                1 0.81659507 0.18340493          1
## 244                2 0.31099522 0.68900478          2
## 245                1 0.87725456 0.12274544          1
## 246                1 0.72164706 0.27835294          1
## 247                1 0.90510091 0.09489909          1
## 248                2 0.55846964 0.44153036          1
## 249                2 0.39852599 0.60147401          2
## 250                1 0.69617343 0.30382657          1
## 251                1 0.76532590 0.23467410          1
## 252                1 0.70305767 0.29694233          1
## 253                2 0.13920397 0.86079603          2
## 254                2 0.46293283 0.53706717          2
## 255                1 0.87668590 0.12331410          1
## 256                2 0.47389257 0.52610743          2
## 257                1 0.42504843 0.57495157          2
## 258                1 0.60112348 0.39887652          1
## 259                1 0.90088705 0.09911295          1
## 260                1 0.87822934 0.12177066          1
## 261                1 0.76531207 0.23468793          1
## 262                1 0.86047413 0.13952587          1
## 263                2 0.55164982 0.44835018          1
## 264                1 0.58014069 0.41985931          1
## 265                1 0.67719393 0.32280607          1
## 266                1 0.92503317 0.07496683          1
## 267                1 0.80566775 0.19433225          1
## 268                2 0.43496565 0.56503435          2
## 269                2 0.33326066 0.66673934          2
## 270                1 0.79867035 0.20132965          1
## 271                1 0.67811675 0.32188325          1
## 272                2 0.22354111 0.77645889          2
## 273                1 0.82372081 0.17627919          1
## 274                2 0.22086552 0.77913448          2
## 275                2 0.28833566 0.71166434          2
## 276                1 0.66028418 0.33971582          1
## 277                1 0.82057562 0.17942438          1
## 278                1 0.56606633 0.43393367          1
## 279                2 0.43850708 0.56149292          2
## 280                2 0.34375869 0.65624131          2
## 281                2 0.44940465 0.55059535          2
## 282                1 0.60038364 0.39961636          1
## 283                2 0.28083994 0.71916006          2
## 284                2 0.31348397 0.68651603          2
## 285                2 0.56170112 0.43829888          1
## 286                2 0.46961605 0.53038395          2
## 287                2 0.39558286 0.60441714          2
## 288                2 0.13942190 0.86057810          2
## 289                2 0.32028520 0.67971480          2
## 290                2 0.49348296 0.50651704          2
## 291                2 0.26125598 0.73874402          2
## 292                2 0.51951119 0.48048881          1
## 293                2 0.42594748 0.57405252          2
## 294                1 0.33731002 0.66268998          2
## 295                1 0.47164062 0.52835938          2
## 296                2 0.21735253 0.78264747          2
## 297                2 0.19592720 0.80407280          2
## 298                2 0.35786931 0.64213069          2
## 299                1 0.51146558 0.48853442          1
## 300                2 0.00000000 1.00000000          2

8.9 Random Forest

# Reload the train/test splits so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))

# randomForest does classification only when the target is a factor.
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)


set.seed(1234)

# Drop collinear variables (columns 4-5); this improved the results.
clasificadorRF <- randomForest(cluster ~ ., data = my_train1[,-4:-5], ntree = 250)

pred_valid_RF <- predict(clasificadorRF, newdata = my_test1[,-4:-5])

# Confusion table as a long data frame: Obj = observed, Pred = predicted.
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_RF))
cfm
##   Obj Pred Freq
## 1   1    1  208
## 2   2    1   19
## 3   1    2    9
## 4   2    2   64
# Misclassification count: the two off-diagonal cells (rows 2 and 3 of cfm).
errclasRF <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasRF ))
## [1] "Errores de clasificación: 28"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

predRF = mean(pred_valid_RF == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predRF ))
## [1] "Porcentaje de acierto/Accuracy: 0.906666666666667"
# NOTE(review): the ROC is built from hard class labels rather than class
# probabilities, so it has a single operating point; predict(..., type =
# "prob") would give a proper curve. Kept as-is to match the printed output.
pred1 <- prediction(as.numeric(pred_valid_RF), as.numeric(my_test1$cluster))

# ROC curve: false positive rate vs. true positive rate — the closer the
# curve is to the top-left corner, the better.
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve — the closer to 1 the better; should exceed 0.7.
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.864804841485759"
# Precision: of the cases classified as group X, what share truly belongs
# to it? High precision = few false positives.

# Precision, group 1
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.95852534562212"
# Precision, group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.771084337349398"
# Recall: of the cases that truly belong to group X, what share is
# classified as such? High recall = few false negatives.

# Recall, group 1
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.916299559471366"
# Recall, group 2
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.876712328767123"
# F1 score, group 1 (harmonic mean of precision and recall)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.936936936936937"
# F1 score, group 2 (original comment said "F2" — typo)
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.82051282051282"
# Collect this model's metrics as a single row and append it to the running
# results table. (Same values as before: `c()` around scalars and the parens
# around the model name were redundant.)
z6 <- data.frame(
  modelo     = "randomForest",
  accuracy   = predRF,
  AUC        = auc,
  precision1 = prec1,
  precision2 = prec2,
  recall1    = rec1,
  recall2    = rec2,
  f1grupo1   = f1gr1,
  f1grupo2   = f1gr2,
  errorClas  = errclasRF
)
RESULTADOS <- rbind(RESULTADOS, z6)

8.10 Ranger

# Reload the train/test splits so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))

# RANGER MODEL

# Hyperparameter grid. NOTE(review): "sampe_size" is a typo for
# "sample_size" — kept because the printed grid below uses that name.
hyper_grid <- expand.grid(
  n.trees = c(100,150,250,500,750,2000),
  node_size  = seq(3, 9, by = 2),
  sampe_size = c(.55, .632, .70, .80),
  OOB_RMSE   = 0
)

# Total number of combinations
nrow(hyper_grid)
## [1] 96
set.seed(1234)

for(i in 1:nrow(hyper_grid)) {
  
  # Fit one ranger classification model for grid row i
  model <- ranger(
    formula         = cluster ~ ., 
    data            = my_train1, 
    num.trees       = hyper_grid$n.trees[i],
    min.node.size   = hyper_grid$node_size[i],
    sample.fraction = hyper_grid$sampe_size[i],
    seed            = 1234, 
    write.forest = TRUE,
    splitrule = "gini",
    verbose = TRUE,
    classification = TRUE,
    keep.inbag = TRUE
  )
  
  # Record the out-of-bag prediction error (sqrt puts it on an RMSE-like scale)
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

# Best parameters: the ten grid rows with the lowest OOB error
hyper_grid %>% 
  dplyr::arrange(OOB_RMSE) %>%
  head(10)
##    n.trees node_size sampe_size  OOB_RMSE
## 1      750         3      0.800 0.3295018
## 2      750         7      0.800 0.3338092
## 3     2000         3      0.632 0.3359422
## 4     2000         7      0.632 0.3359422
## 5      750         3      0.632 0.3380617
## 6      100         9      0.632 0.3380617
## 7      100         3      0.632 0.3401680
## 8      100         7      0.632 0.3401680
## 9      250         9      0.632 0.3401680
## 10     750         5      0.800 0.3401680
# Final model with the best parameters from the grid above (lowest OOB_RMSE)
set.seed(1234)
fit <- ranger(
              cluster ~. ,
              data = my_train1,
              num.trees = 750, 
              importance = 'impurity',
              write.forest = TRUE,
              min.node.size = 3,
              sample.fraction = .8, 
              splitrule = "gini",
              verbose = TRUE,
              classification = TRUE,
              keep.inbag = TRUE
            )

fit
## Ranger result
## 
## Call:
##  ranger(cluster ~ ., data = my_train1, num.trees = 750, importance = "impurity",      write.forest = TRUE, min.node.size = 3, sample.fraction = 0.8,      splitrule = "gini", verbose = TRUE, classification = TRUE,      keep.inbag = TRUE) 
## 
## Type:                             Classification 
## Number of trees:                  750 
## Sample size:                      700 
## Number of independent variables:  31 
## Mtry:                             5 
## Target node size:                 3 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             11.43 %
# Extract impurity-based variable importance and sort it descending
vars_imp <- fit$variable.importance
vars_imp <- as.data.frame(vars_imp)
vars_imp$myvar <- rownames(vars_imp)
vars_imp <- as.data.table (vars_imp)
setorder(vars_imp, -vars_imp)

# Plot the ten most important variables

library(ggpubr) 
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:ggimage':
## 
##     theme_transparent
## The following object is masked from 'package:cvms':
## 
##     font
## The following object is masked from 'package:plyr':
## 
##     mutate
ggbarplot(vars_imp[1:10],
          x = "myvar", y = "vars_imp",
          #fill  = 'myvar',
          color = "blue",             # bar border colour
          palette = "jco",            # jco journal colour palette, see ?ggpar
          sort.val = "asc",          # sort values in ascending order
          sort.by.groups = FALSE,     # don't sort inside each group
          x.text.angle = 90,          # rotate x axis texts vertically
          ylab = "Importancia",
          xlab = 'Variable', 
          #legend.title = "MPG Group",
          rotate = TRUE,
          ggtheme = theme_minimal()
          )

# Evaluate the model on the test set
valor_pred <- predict(fit, data = my_test1)

pred_valid_RANGER <- valor_pred$predictions
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_RANGER))
cfm
##   Obj Pred Freq
## 1   1    1  206
## 2   2    1   20
## 3   1    2   11
## 4   2    2   63
# Misclassification count: the two off-diagonal cells of cfm
errclasRANGER <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasRANGER ))
## [1] "Errores de clasificación: 31"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

predRANGER = mean(pred_valid_RANGER == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predRANGER ))
## [1] "Porcentaje de acierto/Accuracy: 0.896666666666667"
# NOTE(review): ROC built from hard class labels, not probabilities — a
# single operating point. Kept as-is to match the printed output.
pred1 <- prediction(as.numeric(pred_valid_RANGER), as.numeric(my_test1$cluster))

# ROC curve: false positive rate vs. true positive rate — the closer to the
# top-left corner, the better.
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve — the closer to 1 the better; should exceed 0.7.
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.854172450169341"
# Precision: of the cases classified as group X, what share truly belongs
# to it? High precision = few false positives.

# Precision, group 1
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.949308755760369"
# Precision, group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.759036144578313"
# Recall: of the cases that truly belong to group X, what share is
# classified as such? High recall = few false negatives.

# Recall, group 1
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.911504424778761"
# Recall, group 2
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.851351351351351"
# F1 score, group 1 (harmonic mean of precision and recall)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.930022573363431"
# F1 score, group 2 (original comment said "F2" — typo)
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.802547770700637"
# Append this model's metrics to the running results table
z7 <-data.frame(modelo = ('ranger'),
                        accuracy =c(predRANGER),
                        AUC =c(auc),
                        precision1 =c(prec1),
                        precision2 =c(prec2),
                        recall1 =c(rec1),
                        recall2 =c(rec2),
                        f1grupo1 =c(f1gr1),
                        f1grupo2 =c(f1gr2),
                        errorClas =c(errclasRANGER)
                        )
RESULTADOS <-rbind(RESULTADOS,z7)

8.11 Extreme Gradient Boosting

# Reload the train/test splits so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))


# Hyperparameter tuning setup: 3-fold CV, repeated once.
# Fix: use TRUE instead of the shorthand T — T is an ordinary variable that
# can be reassigned, so the literal is safer.
cv.ctrl <- trainControl(method = "repeatedcv", repeats = 1, number = 3, 
                        #summaryFunction = twoClassSummary,
                        classProbs = FALSE,
                        allowParallel = TRUE)

# Full tuning grid for xgbTree (4 * 5 * 2 * 3 * 4 * 4 = 1920 combinations).
xgb.grid <- expand.grid(nrounds = 1000,
            max_depth = c(2,4,6,10),
            eta = c(0.05,0.1,0.2,0.5,1),
            gamma = c(0.1, 0.3),
            colsample_bytree = c(0.3, 0.5 , 0.7 ),
            min_child_weight = c(1, 3,5,7),
            subsample = c(0.25, 0.5,0.75,1))


set.seed(1234)

# The training call is left commented out and the fitted model is downloaded
# instead, because the grid search takes over an hour to run.
#xgb_tune1 <- train(as.factor(cluster) ~., data=my_train1, method="xgbTree", trControl=cv.ctrl, tuneGrid=xgb.grid, verbose=T, metric="Kappa", nthread =3)

#saveRDS(xgb_tune1, file = "xgb_tune1.rds")

xgb_tune1 <- readRDS(url('https://github.com/Juanmick/TFM/blob/master/xgb_tune1.rds?raw=true'))

# Best tuning parameters found by the grid search
xgb_tune1$bestTune
##     nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 770    1000         2 0.2   0.1              0.3                1       0.5
set.seed(1234)

# Make predictions on the test data
pred_valid_XGB <- xgb_tune1 %>% predict(my_test1)


# Confusion table as a long data frame: Obj = observed, Pred = predicted.
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_XGB))
cfm
##   Obj Pred Freq
## 1   1    1  204
## 2   2    1    8
## 3   1    2   13
## 4   2    2   75
# Misclassification count: the two off-diagonal cells of cfm
errclasXGB <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasXGB ))
## [1] "Errores de clasificación: 21"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

predXGB = mean(pred_valid_XGB == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predXGB ))
## [1] "Porcentaje de acierto/Accuracy: 0.93"
# NOTE(review): ROC built from hard class labels, not probabilities — a
# single operating point. Kept as-is to match the printed output.
pred1 <- prediction(as.numeric(pred_valid_XGB), as.numeric(my_test1$cluster))

# ROC curve: false positive rate vs. true positive rate — the closer to the
# top-left corner, the better.
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve — the closer to 1 the better; should exceed 0.7.
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.921853311864972"
# Precision: of the cases classified as group X, what share truly belongs
# to it? High precision = few false positives.

# Precision, group 1
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.940092165898618"
# Precision, group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.903614457831325"
# Recall: of the cases that truly belong to group X, what share is
# classified as such? High recall = few false negatives.

# Recall, group 1
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.962264150943396"
# Recall, group 2
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.852272727272727"
# F1 score, group 1 (harmonic mean of precision and recall)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.951048951048951"
# F1 score, group 2 (original comment said "F2" — typo)
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.87719298245614"
# Append this model's metrics to the running results table
z8 <-data.frame(modelo = ('xgb'),
                        accuracy =c(predXGB),
                        AUC =c(auc),
                        precision1 =c(prec1),
                        precision2 =c(prec2),
                        recall1 =c(rec1),
                        recall2 =c(rec2),
                        f1grupo1 =c(f1gr1),
                        f1grupo2 =c(f1gr2),
                        errorClas =c(errclasXGB)
                        )
RESULTADOS <-rbind(RESULTADOS,z8)

# Variable importance from the tuned xgbTree model
varImp(xgb_tune1)
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 31)
## 
##                    Overall
## daynum             100.000
## Tax5                72.798
## fe_payment          63.885
## tmed                59.014
## fe_customer         42.715
## UnitPrice           38.791
## fe_gender           36.604
## GenderMale          35.302
## day                 35.144
## Rating              32.096
## week                31.460
## longitude           27.619
## Total               25.814
## cogs                24.276
## fe_city             20.136
## CustomerTypeNormal  18.598
## PaymentEwallet      17.959
## fe_lonlat           17.905
## hour                13.280
## latitude             8.838

8.12 Gradient Boosting Machine

# Reload the train/test splits so this section runs stand-alone.
my_test1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_test1.rds?raw=true"))
my_train1 <- readRDS(url("https://github.com/Juanmick/TFM/blob/master/my_train1.rds?raw=true"))


# caret's gbm wrapper needs the target as a factor for classification.
my_test1$cluster <- as.factor(my_test1$cluster)
my_train1$cluster <- as.factor(my_train1$cluster)


set.seed(1234)
# 10-fold cross-validation, repeated 10 times, to validate the models
fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)

set.seed(1234)
gbmFit1 <- train(cluster~ ., data = my_train1, method = "gbm", trControl = fitControl, verbose = FALSE)
                 ## This last option is actually one
                 ## for gbm() that passes through
                 
gbmFit1
## Stochastic Gradient Boosting 
## 
## 700 samples
##  31 predictor
##   2 classes: '1', '2' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 631, 630, 630, 629, 629, 630, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.8724511  0.6989401
##   1                  100      0.9207286  0.8186554
##   1                  150      0.9356013  0.8546552
##   2                   50      0.8990269  0.7666810
##   2                  100      0.9313213  0.8454439
##   2                  150      0.9350055  0.8545164
##   3                   50      0.9024722  0.7766249
##   3                  100      0.9253290  0.8315406
##   3                  150      0.9327419  0.8494871
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  1, shrinkage = 0.1 and n.minobsinnode = 10.
# NOTE(review): the original comment claimed interaction.depth = 2 was best,
# but the printed final model above uses interaction.depth = 1 — verify.
trellis.par.set(caretTheme())
plot(gbmFit1)  

# Level plot of Kappa across the tuning grid
trellis.par.set(caretTheme())
plot(gbmFit1, metric = "Kappa", plotType = "level",
     scales = list(x = list(rot = 90)))

set.seed(1234)
pred_valid_GBM <- predict(gbmFit1, my_test1)


# Confusion table as a long data frame: Obj = observed, Pred = predicted.
cfm <- as.data.frame(table(Obj = my_test1$cluster, Pred = pred_valid_GBM))
cfm
##   Obj Pred Freq
## 1   1    1  211
## 2   2    1   13
## 3   1    2    6
## 4   2    2   70
# Misclassification count: the two off-diagonal cells of cfm
errclasGBM <- cfm[2,3]+cfm[3,3]
print(paste0("Errores de clasificación: ",errclasGBM ))
## [1] "Errores de clasificación: 19"
# Confusion matrix plot
plot_confusion_matrix(cfm, 
                      targets_col = "Obj", 
                      predictions_col = "Pred",
                      counts_col = "Freq")

predGBM = mean(pred_valid_GBM == my_test1$cluster)
print(paste0("Porcentaje de acierto/Accuracy: ",predGBM ))
## [1] "Porcentaje de acierto/Accuracy: 0.936666666666667"
# NOTE(review): ROC built from hard class labels, not probabilities — a
# single operating point. Kept as-is to match the printed output.
pred1 <- prediction(as.numeric(pred_valid_GBM), as.numeric(my_test1$cluster))

# ROC curve: false positive rate vs. true positive rate — the closer to the
# top-left corner, the better.
ROC.perf <- performance(pred1, "tpr", "fpr");
plot (ROC.perf);

# Area under the ROC curve — the closer to 1 the better; should exceed 0.7.
auc.tmp <- performance(pred1,"auc");
auc <- as.numeric(auc.tmp@y.values)
print(paste0("Valor AUC: ",auc ))
## [1] "Valor AUC: 0.907861862195325"
# Precision: of the cases classified as group X, what share truly belongs
# to it? High precision = few false positives.

# Precision, group 1
prec1 = cfm[1,3]/(cfm[1,3]+cfm[3,3])
print(paste0("Precision Grupo 1: ",prec1 ))
## [1] "Precision Grupo 1: 0.972350230414747"
# Precision, group 2
prec2 = cfm[4,3]/(cfm[4,3]+cfm[2,3])
print(paste0("Precision Grupo 2: ",prec2 ))
## [1] "Precision Grupo 2: 0.843373493975904"
# Recall: of the cases that truly belong to group X, what share is
# classified as such? High recall = few false negatives.

# Recall, group 1
rec1 = cfm[1,3]/(cfm[1,3]+cfm[2,3])
print(paste0("Recall Grupo 1: ",rec1 ))
## [1] "Recall Grupo 1: 0.941964285714286"
# Recall, group 2
rec2 = cfm[4,3]/(cfm[3,3]+cfm[4,3])
print(paste0("Recall Grupo 2: ",rec2 ))
## [1] "Recall Grupo 2: 0.921052631578947"
# F1 score, group 1 (harmonic mean of precision and recall)
f1gr1 = 2 * prec1 * rec1 / (prec1 + rec1)
print(paste0("F1 score Grupo 1: ",f1gr1 ))
## [1] "F1 score Grupo 1: 0.956916099773243"
# F1 score, group 2 (original comment said "F2" — typo)
f1gr2 = 2 * prec2 * rec2 / (prec2 + rec2)
print(paste0("F1 score Grupo 2: ",f1gr2 ))
## [1] "F1 score Grupo 2: 0.880503144654088"
# Append this model's metrics to the running results table
z9 <-data.frame(modelo = ('gbm'),
                        accuracy =c(predGBM),
                        AUC =c(auc),
                        precision1 =c(prec1),
                        precision2 =c(prec2),
                        recall1 =c(rec1),
                        recall2 =c(rec2),
                        f1grupo1 =c(f1gr1),
                        f1grupo2 =c(f1gr2),
                        errorClas =c(errclasGBM)
                        )
RESULTADOS <-rbind(RESULTADOS,z9)

9. Resultados

# Final comparison: all model results sorted by descending accuracy
kable(RESULTADOS[order(-RESULTADOS$accuracy),], digits = 2)
modelo accuracy AUC precision1 precision2 recall1 recall2 f1grupo1 f1grupo2 errorClas
3 glm 0.98 0.98 0.98 0.99 1.00 0.94 0.99 0.96 6
6 lda 0.96 0.96 0.97 0.95 0.98 0.92 0.97 0.93 11
2 svm 0.94 0.93 0.96 0.90 0.96 0.89 0.96 0.90 17
11 gbm 0.94 0.91 0.97 0.84 0.94 0.92 0.96 0.88 19
10 xgb 0.93 0.92 0.94 0.90 0.96 0.85 0.95 0.88 21
8 randomForest 0.91 0.86 0.96 0.77 0.92 0.88 0.94 0.82 28
7 adabag 0.90 0.88 0.93 0.83 0.94 0.82 0.93 0.83 29
9 ranger 0.90 0.85 0.95 0.76 0.91 0.85 0.93 0.80 31
4 NaiveBayes 0.85 0.83 0.88 0.78 0.91 0.71 0.89 0.74 45
5 rpart 0.80 0.76 0.85 0.66 0.87 0.62 0.86 0.64 61
1 knn 0.77 0.73 0.86 0.58 0.84 0.61 0.82 0.64 69
# Export the results to an Excel file (left commented; the path below is
# machine-specific)
#library("writexl")
#write_xlsx(RESULTADOS,"C:\\TFM\\resultados.xlsx")